From 8e40fe7e6b01a399c3ea406a59d4cbcbc9bfce5c Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Fri, 7 Jun 2024 16:08:42 -0700
Subject: [PATCH] Remove unused parsing utilities (#15955)

Some parsing utilities have been unused since legacy JSON removal. This PR removes these functions.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15955
---
 cpp/CMakeLists.txt                      |   1 -
 cpp/src/io/utilities/parsing_utils.cu   | 221 -------------------------
 cpp/src/io/utilities/parsing_utils.cuh  |  76 ---------
 3 files changed, 298 deletions(-)
 delete mode 100644 cpp/src/io/utilities/parsing_utils.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f637db66c2c..ca85996b990 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -439,7 +439,6 @@ add_library(
   src/io/utilities/data_sink.cpp
   src/io/utilities/datasource.cpp
   src/io/utilities/file_io_utilities.cpp
-  src/io/utilities/parsing_utils.cu
   src/io/utilities/row_selection.cpp
   src/io/utilities/type_inference.cu
   src/io/utilities/trie.cu
diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu
deleted file mode 100644
index cb8be380c5b..00000000000
--- a/cpp/src/io/utilities/parsing_utils.cu
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-#include
-
-#include
-
-#include
-
-#include
-
-namespace cudf {
-namespace io {
-namespace {
-// When processing the input in chunks, this is the maximum size of each chunk.
-// Only one chunk is loaded on the GPU at a time, so this value is chosen to
-// be small enough to fit on the GPU in most cases.
-constexpr size_t max_chunk_bytes = 256 * 1024 * 1024;  // 256MB
-
-constexpr int bytes_per_find_thread = 64;
-
-using pos_key_pair = thrust::pair<uint64_t, char>;
-
-template <typename T>
-constexpr T divCeil(T dividend, T divisor) noexcept
-{
-  return (dividend + divisor - 1) / divisor;
-}
-
-/**
- * @brief Sets the specified element of the array to the passed value
- */
-template <typename T, typename V>
-__device__ __forceinline__ void setElement(T* array, cudf::size_type idx, T const& t, V const&)
-{
-  array[idx] = t;
-}
-
-/**
- * @brief Sets the specified element of the array of pairs using the two passed
- * parameters.
- */
-template <typename T, typename V>
-__device__ __forceinline__ void setElement(thrust::pair<T, V>* array,
-                                           cudf::size_type idx,
-                                           T const& t,
-                                           V const& v)
-{
-  array[idx] = {t, v};
-}
-
-/**
- * @brief Overloads the setElement() functions for void* arrays.
- * Does not do anything, indexing is not allowed with void* arrays.
- */
-template <typename T, typename V>
-__device__ __forceinline__ void setElement(void*, cudf::size_type, T const&, V const&)
-{
-}
-
-/**
- * @brief CUDA kernel that finds all occurrences of a character in the given
- * character array. If the 'positions' parameter is not void*,
- * positions of all occurrences are stored in the output array.
- *
- * @param[in] data Pointer to the input character array
- * @param[in] size Number of bytes in the input array
- * @param[in] offset Offset to add to the output positions
- * @param[in] key Character to find in the array
- * @param[in,out] count Pointer to the number of found occurrences
- * @param[out] positions Array containing the output positions
- */
-template <typename T>
-CUDF_KERNEL void count_and_set_positions(char const* data,
-                                         uint64_t size,
-                                         uint64_t offset,
-                                         char const key,
-                                         cudf::size_type* count,
-                                         T* positions)
-{
-  // thread IDs range per block, so also need the block id
-  auto const tid = cudf::detail::grid_1d::global_thread_id();
-  auto const did = tid * bytes_per_find_thread;
-
-  char const* raw = (data + did);
-
-  long const byteToProcess =
-    ((did + bytes_per_find_thread) < size) ? bytes_per_find_thread : (size - did);
-
-  // Process the data
-  for (long i = 0; i < byteToProcess; i++) {
-    if (raw[i] == key) {
-      auto const idx = atomicAdd(count, static_cast<cudf::size_type>(1));
-      setElement(positions, idx, did + offset + i, key);
-    }
-  }
-}
-
-}  // namespace
-
-template <typename T>
-cudf::size_type find_all_from_set(device_span<char const> data,
-                                  std::vector<char> const& keys,
-                                  uint64_t result_offset,
-                                  T* positions,
-                                  rmm::cuda_stream_view stream)
-{
-  int block_size = 0;     // suggested thread count to use
-  int min_grid_size = 0;  // minimum block count required
-  CUDF_CUDA_TRY(
-    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions<T>));
-  int const grid_size = divCeil(data.size(), (size_t)block_size);
-
-  auto d_count = cudf::detail::make_zeroed_device_uvector_async<cudf::size_type>(
-    1, stream, rmm::mr::get_current_device_resource());
-  for (char key : keys) {
-    count_and_set_positions<T><<<grid_size, block_size, 0, stream.value()>>>(
-      data.data(), data.size(), result_offset, key, d_count.data(), positions);
-  }
-
-  return cudf::detail::make_std_vector_sync(d_count, stream)[0];
-}
-
-template <typename T>
-cudf::size_type find_all_from_set(host_span<char const> data,
-                                  std::vector<char> const& keys,
-                                  uint64_t result_offset,
-                                  T* positions,
-                                  rmm::cuda_stream_view stream)
-{
-  rmm::device_buffer d_chunk(std::min(max_chunk_bytes, data.size()), stream);
-  auto d_count = cudf::detail::make_zeroed_device_uvector_async<cudf::size_type>(
-    1, stream, rmm::mr::get_current_device_resource());
-
-  int block_size = 0;     // suggested thread count to use
-  int min_grid_size = 0;  // minimum block count required
-  CUDF_CUDA_TRY(
-    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions<T>));
-
-  size_t const chunk_count = divCeil(data.size(), max_chunk_bytes);
-  for (size_t ci = 0; ci < chunk_count; ++ci) {
-    auto const chunk_offset = ci * max_chunk_bytes;
-    auto const h_chunk      = data.data() + chunk_offset;
-    int const chunk_bytes = std::min((size_t)(data.size() - ci * max_chunk_bytes), max_chunk_bytes);
-    auto const chunk_bits = divCeil(chunk_bytes, bytes_per_find_thread);
-    int const grid_size   = divCeil(chunk_bits, block_size);
-
-    // Copy chunk to device
-    CUDF_CUDA_TRY(
-      cudaMemcpyAsync(d_chunk.data(), h_chunk, chunk_bytes, cudaMemcpyDefault, stream.value()));
-
-    for (char key : keys) {
-      count_and_set_positions<T>
-        <<<grid_size, block_size, 0, stream.value()>>>(static_cast<char*>(d_chunk.data()),
-                                                       chunk_bytes,
-                                                       chunk_offset + result_offset,
-                                                       key,
-                                                       d_count.data(),
-                                                       positions);
-    }
-  }
-
-  return cudf::detail::make_std_vector_sync(d_count, stream)[0];
-}
-
-template cudf::size_type find_all_from_set<uint64_t>(device_span<char const> data,
-                                                     std::vector<char> const& keys,
-                                                     uint64_t result_offset,
-                                                     uint64_t* positions,
-                                                     rmm::cuda_stream_view stream);
-
-template cudf::size_type find_all_from_set<pos_key_pair>(device_span<char const> data,
-                                                         std::vector<char> const& keys,
-                                                         uint64_t result_offset,
-                                                         pos_key_pair* positions,
-                                                         rmm::cuda_stream_view stream);
-
-template cudf::size_type find_all_from_set<uint64_t>(host_span<char const> data,
-                                                     std::vector<char> const& keys,
-                                                     uint64_t result_offset,
-                                                     uint64_t* positions,
-                                                     rmm::cuda_stream_view stream);
-
-template cudf::size_type find_all_from_set<pos_key_pair>(host_span<char const> data,
-                                                         std::vector<char> const& keys,
-                                                         uint64_t result_offset,
-                                                         pos_key_pair* positions,
-                                                         rmm::cuda_stream_view stream);
-
-cudf::size_type count_all_from_set(device_span<char const> data,
-                                   std::vector<char> const& keys,
-                                   rmm::cuda_stream_view stream)
-{
-  return find_all_from_set(data, keys, 0, nullptr, stream);
-}
-
-cudf::size_type count_all_from_set(host_span<char const> data,
-                                   std::vector<char> const& keys,
-                                   rmm::cuda_stream_view stream)
-{
-  return find_all_from_set(data, keys, 0, nullptr, stream);
-}
-
-}  // namespace io
-}  // namespace cudf
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index faee05541cc..bc2722441d0 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -414,82 +414,6 @@ __device__ __inline__ cudf::size_type* infer_integral_field_counter(char const*
 }  // namespace gpu
 
-/**
- * @brief Searches the input character array for each of characters in a set.
- * Sums up the number of occurrences. If the 'positions' parameter is not void*,
- * positions of all occurrences are stored in the output device array.
- *
- * @param[in] d_data Input character array in device memory
- * @param[in] keys Vector containing the keys to count in the buffer
- * @param[in] result_offset Offset to add to the output positions
- * @param[out] positions Array containing the output positions
- * @param[in] stream CUDA stream used for device memory operations and kernel launches
- *
- * @return cudf::size_type total number of occurrences
- */
-template <typename T>
-cudf::size_type find_all_from_set(device_span<char const> data,
-                                  std::vector<char> const& keys,
-                                  uint64_t result_offset,
-                                  T* positions,
-                                  rmm::cuda_stream_view stream);
-
-/**
- * @brief Searches the input character array for each of characters in a set.
- * Sums up the number of occurrences. If the 'positions' parameter is not void*,
- * positions of all occurrences are stored in the output device array.
- *
- * Does not load the entire file into the GPU memory at any time, so it can
- * be used to parse large files. Output array needs to be preallocated.
- *
- * @param[in] h_data Pointer to the input character array
- * @param[in] h_size Number of bytes in the input array
- * @param[in] keys Vector containing the keys to count in the buffer
- * @param[in] result_offset Offset to add to the output positions
- * @param[out] positions Array containing the output positions
- * @param[in] stream CUDA stream used for device memory operations and kernel launches
- *
- * @return cudf::size_type total number of occurrences
- */
-template <typename T>
-cudf::size_type find_all_from_set(host_span<char const> data,
-                                  std::vector<char> const& keys,
-                                  uint64_t result_offset,
-                                  T* positions,
-                                  rmm::cuda_stream_view stream);
-
-/**
- * @brief Searches the input character array for each of characters in a set
- * and sums up the number of occurrences.
- *
- * @param d_data Input data buffer in device memory
- * @param keys Vector containing the keys to count in the buffer
- * @param stream CUDA stream used for device memory operations and kernel launches
- *
- * @return cudf::size_type total number of occurrences
- */
-cudf::size_type count_all_from_set(device_span<char const> data,
-                                   std::vector<char> const& keys,
-                                   rmm::cuda_stream_view stream);
-
-/**
- * @brief Searches the input character array for each of characters in a set
- * and sums up the number of occurrences.
- *
- * Does not load the entire buffer into the GPU memory at any time, so it can
- * be used with buffers of any size.
- *
- * @param h_data Pointer to the data in host memory
- * @param h_size Size of the input data, in bytes
- * @param keys Vector containing the keys to count in the buffer
- * @param stream CUDA stream used for device memory operations and kernel launches
- *
- * @return cudf::size_type total number of occurrences
- */
-cudf::size_type count_all_from_set(host_span<char const> data,
-                                   std::vector<char> const& keys,
-                                   rmm::cuda_stream_view stream);
-
 /**
  * @brief Checks whether the given character is a whitespace character.
  *