From 8e40fe7e6b01a399c3ea406a59d4cbcbc9bfce5c Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Fri, 7 Jun 2024 16:08:42 -0700
Subject: [PATCH] Remove unused parsing utilities (#15955)

Some parsing utilities have been unused since legacy JSON removal. This PR removes these functions.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15955
---
 cpp/CMakeLists.txt                      |   1 -
 cpp/src/io/utilities/parsing_utils.cu   | 221 -------------------------
 cpp/src/io/utilities/parsing_utils.cuh  |  76 ---------
 3 files changed, 298 deletions(-)
 delete mode 100644 cpp/src/io/utilities/parsing_utils.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f637db66c2c..ca85996b990 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -439,7 +439,6 @@ add_library(
   src/io/utilities/data_sink.cpp
   src/io/utilities/datasource.cpp
   src/io/utilities/file_io_utilities.cpp
-  src/io/utilities/parsing_utils.cu
   src/io/utilities/row_selection.cpp
   src/io/utilities/type_inference.cu
   src/io/utilities/trie.cu
diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu
deleted file mode 100644
index cb8be380c5b..00000000000
--- a/cpp/src/io/utilities/parsing_utils.cu
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-#include
-
-#include
-
-#include
-
-#include
-
-namespace cudf {
-namespace io {
-namespace {
-// When processing the input in chunks, this is the maximum size of each chunk.
-// Only one chunk is loaded on the GPU at a time, so this value is chosen to
-// be small enough to fit on the GPU in most cases.
-constexpr size_t max_chunk_bytes = 256 * 1024 * 1024;  // 256MB
-
-constexpr int bytes_per_find_thread = 64;
-
-using pos_key_pair = thrust::pair<uint64_t, char>;
-
-template <typename T>
-constexpr T divCeil(T dividend, T divisor) noexcept
-{
-  return (dividend + divisor - 1) / divisor;
-}
-
-/**
- * @brief Sets the specified element of the array to the passed value
- */
-template <typename T, typename V>
-__device__ __forceinline__ void setElement(T* array, cudf::size_type idx, T const& t, V const&)
-{
-  array[idx] = t;
-}
-
-/**
- * @brief Sets the specified element of the array of pairs using the two passed
- * parameters.
- */
-template <typename T, typename V>
-__device__ __forceinline__ void setElement(thrust::pair<T, V>* array,
-                                           cudf::size_type idx,
-                                           T const& t,
-                                           V const& v)
-{
-  array[idx] = {t, v};
-}
-
-/**
- * @brief Overloads the setElement() functions for void* arrays.
- * Does not do anything, indexing is not allowed with void* arrays.
- */
-template <typename T, typename V>
-__device__ __forceinline__ void setElement(void*, cudf::size_type, T const&, V const&)
-{
-}
-
-/**
- * @brief CUDA kernel that finds all occurrences of a character in the given
- * character array. If the 'positions' parameter is not void*,
- * positions of all occurrences are stored in the output array.
- *
- * @param[in] data Pointer to the input character array
- * @param[in] size Number of bytes in the input array
- * @param[in] offset Offset to add to the output positions
- * @param[in] key Character to find in the array
- * @param[in,out] count Pointer to the number of found occurrences
- * @param[out] positions Array containing the output positions
- */
-template <typename T>
-CUDF_KERNEL void count_and_set_positions(char const* data,
-                                         uint64_t size,
-                                         uint64_t offset,
-                                         char const key,
-                                         cudf::size_type* count,
-                                         T* positions)
-{
-  // thread IDs range per block, so also need the block id
-  auto const tid = cudf::detail::grid_1d::global_thread_id();
-  auto const did = tid * bytes_per_find_thread;
-
-  char const* raw = (data + did);
-
-  long const byteToProcess =
-    ((did + bytes_per_find_thread) < size) ? bytes_per_find_thread : (size - did);
-
-  // Process the data
-  for (long i = 0; i < byteToProcess; i++) {
-    if (raw[i] == key) {
-      auto const idx = atomicAdd(count, static_cast<cudf::size_type>(1));
-      setElement(positions, idx, did + offset + i, key);
-    }
-  }
-}
-
-}  // namespace
-
-template <typename T>
-cudf::size_type find_all_from_set(device_span<char const> data,
-                                  std::vector<char> const& keys,
-                                  uint64_t result_offset,
-                                  T* positions,
-                                  rmm::cuda_stream_view stream)
-{
-  int block_size = 0;     // suggested thread count to use
-  int min_grid_size = 0;  // minimum block count required
-  CUDF_CUDA_TRY(
-    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions<T>));
-  int const grid_size = divCeil(data.size(), (size_t)block_size);
-
-  auto d_count = cudf::detail::make_zeroed_device_uvector_async<cudf::size_type>(
-    1, stream, rmm::mr::get_current_device_resource());
-  for (char key : keys) {
-    count_and_set_positions<T><<<grid_size, block_size, 0, stream.value()>>>(
-      data.data(), data.size(), result_offset, key, d_count.data(), positions);
-  }
-
-  return cudf::detail::make_std_vector_sync(d_count, stream)[0];
-}
-
-template <typename T>
-cudf::size_type find_all_from_set(host_span<char const> data,
-                                  std::vector<char> const& keys,
-                                  uint64_t result_offset,
-                                  T* positions,
-                                  rmm::cuda_stream_view stream)
-{
-  rmm::device_buffer d_chunk(std::min(max_chunk_bytes, data.size()), stream);
-  auto d_count = cudf::detail::make_zeroed_device_uvector_async<cudf::size_type>(
-    1, stream, rmm::mr::get_current_device_resource());
-
-  int block_size = 0;     // suggested thread count to use
-  int min_grid_size = 0;  // minimum block count required
-  CUDF_CUDA_TRY(
-    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions<T>));
-
-  size_t const chunk_count = divCeil(data.size(), max_chunk_bytes);
-  for (size_t ci = 0; ci < chunk_count; ++ci) {
-    auto const chunk_offset = ci * max_chunk_bytes;
-    auto const h_chunk      = data.data() + chunk_offset;
-    int const chunk_bytes = std::min((size_t)(data.size() - ci * max_chunk_bytes), max_chunk_bytes);
-    auto const chunk_bits = divCeil(chunk_bytes, bytes_per_find_thread);
-    int const grid_size   = divCeil(chunk_bits, block_size);
-
-    // Copy chunk to device
-    CUDF_CUDA_TRY(
-      cudaMemcpyAsync(d_chunk.data(), h_chunk, chunk_bytes, cudaMemcpyDefault, stream.value()));
-
-    for (char key : keys) {
-      count_and_set_positions<T>
-        <<<grid_size, block_size, 0, stream.value()>>>(static_cast<char*>(d_chunk.data()),
-                                                       chunk_bytes,
-                                                       chunk_offset + result_offset,
-                                                       key,
-                                                       d_count.data(),
-                                                       positions);
-    }
-  }
-
-  return cudf::detail::make_std_vector_sync(d_count, stream)[0];
-}
-
-template cudf::size_type find_all_from_set<uint64_t>(device_span<char const> data,
-                                                     std::vector<char> const& keys,
-                                                     uint64_t result_offset,
-                                                     uint64_t* positions,
-                                                     rmm::cuda_stream_view stream);
-
-template cudf::size_type find_all_from_set<pos_key_pair>(device_span<char const> data,
-                                                         std::vector<char> const& keys,
-                                                         uint64_t result_offset,
-                                                         pos_key_pair* positions,
-                                                         rmm::cuda_stream_view stream);
-
-template cudf::size_type find_all_from_set<uint64_t>(host_span<char const> data,
-                                                     std::vector<char> const& keys,
-                                                     uint64_t result_offset,
-                                                     uint64_t* positions,
-                                                     rmm::cuda_stream_view stream);
-
-template cudf::size_type find_all_from_set<pos_key_pair>(host_span<char const> data,
-                                                         std::vector<char> const& keys,
-                                                         uint64_t result_offset,
-                                                         pos_key_pair* positions,
-                                                         rmm::cuda_stream_view stream);
-
-cudf::size_type count_all_from_set(device_span<char const> data,
-                                   std::vector<char> const& keys,
-                                   rmm::cuda_stream_view stream)
-{
-  return find_all_from_set(data, keys, 0, nullptr, stream);
-}
-
-cudf::size_type count_all_from_set(host_span<char const> data,
-                                   std::vector<char> const& keys,
-                                   rmm::cuda_stream_view stream)
-{
-  return find_all_from_set(data, keys, 0, nullptr, stream);
-}
-
-}  // namespace io
-}  // namespace cudf
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index faee05541cc..bc2722441d0 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -414,82 +414,6 @@ __device__ __inline__ cudf::size_type* infer_integral_field_counter(char const*
 }  // namespace gpu
 
-/**
- * @brief Searches the input character array for each of characters in a set.
- * Sums up the number of occurrences. If the 'positions' parameter is not void*,
- * positions of all occurrences are stored in the output device array.
- *
- * @param[in] d_data Input character array in device memory
- * @param[in] keys Vector containing the keys to count in the buffer
- * @param[in] result_offset Offset to add to the output positions
- * @param[out] positions Array containing the output positions
- * @param[in] stream CUDA stream used for device memory operations and kernel launches
- *
- * @return cudf::size_type total number of occurrences
- */
-template <typename T>
-cudf::size_type find_all_from_set(device_span<char const> data,
-                                  std::vector<char> const& keys,
-                                  uint64_t result_offset,
-                                  T* positions,
-                                  rmm::cuda_stream_view stream);
-
-/**
- * @brief Searches the input character array for each of characters in a set.
- * Sums up the number of occurrences. If the 'positions' parameter is not void*,
- * positions of all occurrences are stored in the output device array.
- *
- * Does not load the entire file into the GPU memory at any time, so it can
- * be used to parse large files. Output array needs to be preallocated.
- *
- * @param[in] h_data Pointer to the input character array
- * @param[in] h_size Number of bytes in the input array
- * @param[in] keys Vector containing the keys to count in the buffer
- * @param[in] result_offset Offset to add to the output positions
- * @param[out] positions Array containing the output positions
- * @param[in] stream CUDA stream used for device memory operations and kernel launches
- *
- * @return cudf::size_type total number of occurrences
- */
-template <typename T>
-cudf::size_type find_all_from_set(host_span<char const> data,
-                                  std::vector<char> const& keys,
-                                  uint64_t result_offset,
-                                  T* positions,
-                                  rmm::cuda_stream_view stream);
-
-/**
- * @brief Searches the input character array for each of characters in a set
- * and sums up the number of occurrences.
- *
- * @param d_data Input data buffer in device memory
- * @param keys Vector containing the keys to count in the buffer
- * @param stream CUDA stream used for device memory operations and kernel launches
- *
- * @return cudf::size_type total number of occurrences
- */
-cudf::size_type count_all_from_set(device_span<char const> data,
-                                   std::vector<char> const& keys,
-                                   rmm::cuda_stream_view stream);
-
-/**
- * @brief Searches the input character array for each of characters in a set
- * and sums up the number of occurrences.
- *
- * Does not load the entire buffer into the GPU memory at any time, so it can
- * be used with buffers of any size.
- *
- * @param h_data Pointer to the data in host memory
- * @param h_size Size of the input data, in bytes
- * @param keys Vector containing the keys to count in the buffer
- * @param stream CUDA stream used for device memory operations and kernel launches
- *
- * @return cudf::size_type total number of occurrences
- */
-cudf::size_type count_all_from_set(host_span<char const> data,
-                                   std::vector<char> const& keys,
-                                   rmm::cuda_stream_view stream);
-
 /**
  * @brief Checks whether the given character is a whitespace character.
  *