From d10eae79ec06ec002a34420a34380e077c4540f7 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 26 Nov 2024 10:01:00 -0500 Subject: [PATCH 01/14] Update strings/text source to use grid_1d for thread/block/stride calculations (#17404) Replaces `threadIdx.x + blockDim.x * blockIdx.x` logic with `grid_1d::global_thread_id()` and `blockDim.x * gridDim.x` with `grid_1d::grid_stride()` in libcudf strings and text source. Reference #10368 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17404 --- cpp/include/cudf/strings/detail/gather.cuh | 10 +++++----- cpp/src/strings/convert/convert_urls.cu | 6 ++++-- cpp/src/strings/copying/concatenate.cu | 10 +++++----- cpp/src/strings/regex/utilities.cuh | 4 ++-- cpp/src/strings/search/find.cu | 16 +++++++--------- cpp/src/text/minhash.cu | 9 +++------ cpp/src/text/subword/data_normalizer.cu | 5 ++--- cpp/src/text/subword/subword_tokenize.cu | 11 +++-------- cpp/src/text/subword/wordpiece_tokenizer.cu | 14 ++++---------- cpp/src/text/vocabulary_tokenize.cu | 9 +++------ 10 files changed, 38 insertions(+), 56 deletions(-) diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 4216523df97..58665fbf27e 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -85,15 +85,15 @@ CUDF_KERNEL void gather_chars_fn_string_parallel(StringIterator strings_begin, constexpr size_t out_datatype_size = sizeof(uint4); constexpr size_t in_datatype_size = sizeof(uint); - int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int global_warp_id = global_thread_id / cudf::detail::warp_size; - int warp_lane = global_thread_id % cudf::detail::warp_size; - int nwarps = gridDim.x * blockDim.x / 
cudf::detail::warp_size; + auto const global_thread_id = cudf::detail::grid_1d::global_thread_id(); + auto const global_warp_id = global_thread_id / cudf::detail::warp_size; + auto const warp_lane = global_thread_id % cudf::detail::warp_size; + auto const nwarps = cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size; auto const alignment_offset = reinterpret_cast(out_chars) % out_datatype_size; uint4* out_chars_aligned = reinterpret_cast(out_chars - alignment_offset); - for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) { + for (auto istring = global_warp_id; istring < total_out_strings; istring += nwarps) { auto const out_start = out_offsets[istring]; auto const out_end = out_offsets[istring + 1]; diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 520f5897415..9d0186b7a51 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -207,7 +207,8 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings, auto const global_warp_id = static_cast(global_thread_id / cudf::detail::warp_size); auto const local_warp_id = static_cast(threadIdx.x / cudf::detail::warp_size); auto const warp_lane = static_cast(threadIdx.x % cudf::detail::warp_size); - auto const nwarps = static_cast(gridDim.x * blockDim.x / cudf::detail::warp_size); + auto const nwarps = + static_cast(cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size); char* in_chars_shared = temporary_buffer[local_warp_id]; // Loop through strings, and assign each string to a warp. 
@@ -293,7 +294,8 @@ CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings, auto const global_warp_id = static_cast(global_thread_id / cudf::detail::warp_size); auto const local_warp_id = static_cast(threadIdx.x / cudf::detail::warp_size); auto const warp_lane = static_cast(threadIdx.x % cudf::detail::warp_size); - auto const nwarps = static_cast(gridDim.x * blockDim.x / cudf::detail::warp_size); + auto const nwarps = + static_cast(cudf::detail::grid_1d::grid_stride() / cudf::detail::warp_size); char* in_chars_shared = temporary_buffer[local_warp_id]; // Loop through strings, and assign each string to a warp diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 9e4ef47ff79..3712b0e7fc6 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -123,8 +123,8 @@ CUDF_KERNEL void fused_concatenate_string_offset_kernel( bitmask_type* output_mask, size_type* out_valid_count) { - cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x; - size_type warp_valid_count = 0; + auto output_index = cudf::detail::grid_1d::global_thread_id(); + size_type warp_valid_count = 0; unsigned active_mask; if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); } @@ -156,7 +156,7 @@ CUDF_KERNEL void fused_concatenate_string_offset_kernel( warp_valid_count += __popc(new_word); } - output_index += blockDim.x * gridDim.x; + output_index += cudf::detail::grid_1d::grid_stride(); if (Nullable) { active_mask = __ballot_sync(active_mask, output_index < output_size); } } @@ -178,7 +178,7 @@ CUDF_KERNEL void fused_concatenate_string_chars_kernel(column_device_view const* size_type const output_size, char* output_data) { - cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x; + auto output_index = cudf::detail::grid_1d::global_thread_id(); while (output_index < output_size) { // Lookup input index by searching for 
output index in offsets @@ -198,7 +198,7 @@ CUDF_KERNEL void fused_concatenate_string_chars_kernel(column_device_view const* auto const first_char = input_offsets_data[input_view.offset()]; output_data[output_index] = input_chars_data[offset_index + first_char]; - output_index += blockDim.x * gridDim.x; + output_index += cudf::detail::grid_1d::grid_stride(); } } diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh index 679907788bb..2594fd7b6da 100644 --- a/cpp/src/strings/regex/utilities.cuh +++ b/cpp/src/strings/regex/utilities.cuh @@ -47,7 +47,7 @@ CUDF_KERNEL void for_each_kernel(ForEachFunction fn, reprog_device const d_prog, __syncthreads(); auto const s_prog = reprog_device::load(d_prog, shmem); - auto const thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + auto const thread_idx = cudf::detail::grid_1d::global_thread_id(); auto const stride = s_prog.thread_count(); if (thread_idx < stride) { for (auto idx = thread_idx; idx < size; idx += stride) { @@ -84,7 +84,7 @@ CUDF_KERNEL void transform_kernel(TransformFunction fn, __syncthreads(); auto const s_prog = reprog_device::load(d_prog, shmem); - auto const thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + auto const thread_idx = cudf::detail::grid_1d::global_thread_id(); auto const stride = s_prog.thread_count(); if (thread_idx < stride) { for (auto idx = thread_idx; idx < size; idx += stride) { diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 9bd1abb5542..3cf4970d36e 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -121,11 +121,10 @@ CUDF_KERNEL void finder_warp_parallel_fn(column_device_view const d_strings, size_type const stop, size_type* d_results) { - size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + auto const idx = cudf::detail::grid_1d::global_thread_id(); - if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; } - - auto const str_idx = idx / 
cudf::detail::warp_size; + auto const str_idx = idx / cudf::detail::warp_size; + if (str_idx >= d_strings.size()) { return; } auto const lane_idx = idx % cudf::detail::warp_size; if (d_strings.is_null(str_idx)) { return; } @@ -350,13 +349,12 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, string_view const d_target, bool* d_results) { - size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - using warp_reduce = cub::WarpReduce; + auto const idx = cudf::detail::grid_1d::global_thread_id(); + using warp_reduce = cub::WarpReduce; __shared__ typename warp_reduce::TempStorage temp_storage; - if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; } - - auto const str_idx = idx / cudf::detail::warp_size; + auto const str_idx = idx / cudf::detail::warp_size; + if (str_idx >= d_strings.size()) { return; } auto const lane_idx = idx % cudf::detail::warp_size; if (d_strings.is_null(str_idx)) { return; } // get the string for this warp diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index aee83ab35ed..b7a719a2041 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -74,13 +74,10 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, cudf::size_type width, hash_value_type* d_hashes) { - auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (idx >= (static_cast(d_strings.size()) * - static_cast(cudf::detail::warp_size))) { - return; - } + auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = static_cast(idx / cudf::detail::warp_size); + auto const str_idx = static_cast(idx / cudf::detail::warp_size); + if (str_idx >= d_strings.size()) { return; } auto const lane_idx = static_cast(idx % cudf::detail::warp_size); if (d_strings.is_null(str_idx)) { return; } diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index c662581b3f4..a3bed45e4bd 100644 --- 
a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -217,9 +217,8 @@ CUDF_KERNEL void kernel_data_normalizer(unsigned char const* strings, constexpr uint32_t init_val = (1 << FILTER_BIT); uint32_t replacement_code_points[MAX_NEW_CHARS] = {init_val, init_val, init_val}; - cudf::thread_index_type const char_for_thread = - threadIdx.x + cudf::thread_index_type(blockIdx.x) * cudf::thread_index_type(blockDim.x); - uint32_t num_new_chars = 0; + auto const char_for_thread = cudf::detail::grid_1d::global_thread_id(); + uint32_t num_new_chars = 0; if (char_for_thread < total_bytes) { auto const code_point = extract_code_points_from_utf8(strings, total_bytes, char_for_thread); diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index dee589d6daf..6302b478c14 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -73,15 +73,10 @@ CUDF_KERNEL void kernel_compute_tensor_metadata( uint32_t* attn_mask, uint32_t* metadata) { - cudf::thread_index_type const output_idx = - threadIdx.x + static_cast(blockIdx.x) * - static_cast(blockDim.x); - if (output_idx >= (static_cast(nrows_tensor_token_ids) * - static_cast(max_sequence_length))) { - return; - } + auto const output_idx = cudf::detail::grid_1d::global_thread_id(); - uint32_t const absolute_row_id = output_idx / max_sequence_length; + uint32_t const absolute_row_id = output_idx / max_sequence_length; + if (absolute_row_id >= nrows_tensor_token_ids) { return; } uint32_t const tensor_id = row2tensor[absolute_row_id]; uint32_t const row_within_tensor = row2row_within_tensor[absolute_row_id]; uint32_t const offset_token_ids_tensor = offsets[tensor_id]; diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index c094537ebc2..dd1e8ddb027 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -83,9 +83,7 
@@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi uint32_t* token_ids, uint8_t* tokens_per_word) { - cudf::thread_index_type char_for_thread = static_cast(blockDim.x) * - static_cast(blockIdx.x) + - threadIdx.x; + auto const char_for_thread = cudf::detail::grid_1d::global_thread_id(); // Deal with the start_word_indices array if (char_for_thread < num_code_points) { @@ -138,9 +136,7 @@ CUDF_KERNEL void mark_string_start_and_ends(uint32_t const* code_points, uint32_t* end_word_indices, uint32_t num_strings) { - cudf::thread_index_type idx = static_cast(blockDim.x) * - static_cast(blockIdx.x) + - threadIdx.x; + auto const idx = cudf::detail::grid_1d::global_thread_id(); // Ensure the starting character of each strings is written to the word start array. if (idx <= num_strings) { auto const offset = strings_offsets[idx]; @@ -335,11 +331,9 @@ CUDF_KERNEL void kernel_wordpiece_tokenizer(uint32_t const* code_points, uint32_t* token_ids, uint8_t* tokens_per_word) { - cudf::thread_index_type word_to_tokenize = static_cast(blockDim.x) * - static_cast(blockIdx.x) + - threadIdx.x; + auto const word_to_tokenize = cudf::detail::grid_1d::global_thread_id(); - if (word_to_tokenize >= total_words) return; + if (word_to_tokenize >= total_words) { return; } // Each thread gets the start code_point offset for each word and resets the token_id memory to // the default value. In a post processing step, all of these values will be removed. 
auto const token_start = word_starts[word_to_tokenize]; diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index a2297987732..caf2b1d8b30 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -222,12 +222,9 @@ CUDF_KERNEL void token_counts_fn(cudf::column_device_view const d_strings, int8_t* d_results) { // string per warp - auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); - if (idx >= (static_cast(d_strings.size()) * - static_cast(cudf::detail::warp_size))) { - return; - } - auto const str_idx = static_cast(idx / cudf::detail::warp_size); + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = static_cast(idx / cudf::detail::warp_size); + if (str_idx >= d_strings.size()) { return; } auto const lane_idx = static_cast(idx % cudf::detail::warp_size); if (d_strings.is_null(str_idx)) { From e7022fbc22eda538783e67f32d35ea8ea0798be8 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 26 Nov 2024 10:01:53 -0500 Subject: [PATCH 02/14] Use thread_index_type in binary-ops jit kernel.cu (#17420) Follow on to #17354 to prevent overflow in jit kernel binary-ops. This uses the `thread_index_type` directly since the `detail/utilities/cuda.cuh` cannot be included in the jit'd kernel source. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17420 --- cpp/src/binaryop/jit/kernel.cu | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/cpp/src/binaryop/jit/kernel.cu b/cpp/src/binaryop/jit/kernel.cu index 985fc87521c..1133e9ac22e 100644 --- a/cpp/src/binaryop/jit/kernel.cu +++ b/cpp/src/binaryop/jit/kernel.cu @@ -51,15 +51,10 @@ CUDF_KERNEL void kernel_v_v(cudf::size_type size, TypeLhs* lhs_data, TypeRhs* rhs_data) { - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; + auto const start = threadIdx.x + static_cast(blockIdx.x) * blockDim.x; + auto const step = static_cast(blockDim.x) * gridDim.x; - int start = tid + blkid * blksz; - int step = blksz * gridsz; - - for (cudf::size_type i = start; i < size; i += step) { + for (auto i = start; i < size; i += step) { out_data[i] = TypeOpe::template operate(lhs_data[i], rhs_data[i]); } } @@ -75,15 +70,10 @@ CUDF_KERNEL void kernel_v_v_with_validity(cudf::size_type size, cudf::bitmask_type const* rhs_mask, cudf::size_type rhs_offset) { - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; - - int start = tid + blkid * blksz; - int step = blksz * gridsz; + auto const start = threadIdx.x + static_cast(blockIdx.x) * blockDim.x; + auto const step = static_cast(blockDim.x) * gridDim.x; - for (cudf::size_type i = start; i < size; i += step) { + for (auto i = start; i < size; i += step) { bool output_valid = false; out_data[i] = TypeOpe::template operate( lhs_data[i], From f5954a44ce86af82b7750f64e511d063e35e9625 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 26 Nov 2024 12:04:36 -0500 Subject: [PATCH 03/14] Remove nvtx/ranges.hpp include from cuda.cuh (#17427) Removes unused 
header include `nvtx/ranges.hpp` from `cuda.cuh` and fixes up all the source files that were dependent on this include. Found while trying to include `cuda.cuh` in a jit'd kernel source. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17427 --- cpp/include/cudf/detail/utilities/cuda.cuh | 6 ++---- cpp/src/copying/contiguous_split.cu | 1 + cpp/src/join/conditional_join.cu | 1 + cpp/src/join/hash_join.cu | 1 + cpp/src/join/join.cu | 1 + cpp/src/join/mixed_join.cu | 1 + cpp/src/join/mixed_join_semi.cu | 1 + cpp/src/json/json_path.cu | 1 + cpp/src/labeling/label_bins.cu | 1 + cpp/src/lists/contains.cu | 1 + cpp/src/lists/copying/segmented_gather.cu | 1 + cpp/src/quantiles/tdigest/tdigest.cu | 1 + cpp/src/reductions/minmax.cu | 1 + cpp/src/rolling/detail/rolling_fixed_window.cu | 1 + cpp/src/rolling/detail/rolling_variable_window.cu | 1 + cpp/src/rolling/grouped_rolling.cu | 1 + cpp/src/strings/copying/concatenate.cu | 1 + 17 files changed, 18 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index d31ca3d92d1..61a8e9f7ec3 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -16,7 +16,6 @@ #pragma once -#include #include #include #include @@ -25,8 +24,7 @@ #include #include - -#include +#include namespace cudf { namespace detail { @@ -164,7 +162,7 @@ template __device__ T single_lane_block_sum_reduce(T lane_value) { static_assert(block_size <= 1024, "Invalid block size."); - static_assert(std::is_arithmetic_v, "Invalid non-arithmetic type."); + static_assert(cuda::std::is_arithmetic_v, "Invalid non-arithmetic type."); constexpr auto warps_per_block{block_size / warp_size}; auto const lane_id{threadIdx.x % warp_size}; auto const warp_id{threadIdx.x / warp_size}; diff --git 
a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 15aa31ff5ee..e9443980320 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 781fda215fd..81287e9a3fd 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index beeaabfdaab..05b85fed1a8 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 7b13c260364..bbafb159caf 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -16,6 +16,7 @@ #include "join_common_utils.hpp" #include +#include #include #include #include diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index 90b0d0a45ad..56044bb1264 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 62ba558b0bd..6c37f801693 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index fb5cf66dd60..fd8629ed6f3 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/labeling/label_bins.cu b/cpp/src/labeling/label_bins.cu index 18a500069ad..40a48d919cd 100644 --- a/cpp/src/labeling/label_bins.cu +++ 
b/cpp/src/labeling/label_bins.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index 9556ef23784..03fbd8e5e89 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu index f6e48f141e1..9d11035cfdc 100644 --- a/cpp/src/lists/copying/segmented_gather.cu +++ b/cpp/src/lists/copying/segmented_gather.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index fb5aebb4b39..3a365477366 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -18,6 +18,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index 4f6eb23ce5b..98fd9f679c8 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/rolling/detail/rolling_fixed_window.cu b/cpp/src/rolling/detail/rolling_fixed_window.cu index 23424da13cd..0603f27852a 100644 --- a/cpp/src/rolling/detail/rolling_fixed_window.cu +++ b/cpp/src/rolling/detail/rolling_fixed_window.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include diff --git a/cpp/src/rolling/detail/rolling_variable_window.cu b/cpp/src/rolling/detail/rolling_variable_window.cu index c2324947ef6..d4851df740b 100644 --- a/cpp/src/rolling/detail/rolling_variable_window.cu +++ b/cpp/src/rolling/detail/rolling_variable_window.cu @@ -17,6 +17,7 @@ #include "rolling.cuh" #include +#include #include #include diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 
ac6c7b11ef5..3cf292f5abb 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -21,6 +21,7 @@ #include "detail/rolling_jit.hpp" #include +#include #include #include #include diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 3712b0e7fc6..ba96e2cb988 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include From b89728b7ccdb7f39b70087eccc2c8c36765742bd Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 26 Nov 2024 18:10:45 +0100 Subject: [PATCH 04/14] Abstract polars function expression nodes to ensure they are serializable (#17418) Use `Enum`s to define Python types as references to `polars.polars._expr_nodes.*Function` as to ensure `cudf_polars.dsl.expressions` specializations of `Expr` are serializable. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17418 --- .../cudf_polars/dsl/expressions/boolean.py | 77 +++++++--- .../cudf_polars/dsl/expressions/datetime.py | 98 ++++++++++--- .../cudf_polars/dsl/expressions/string.py | 135 +++++++++++++----- python/cudf_polars/cudf_polars/dsl/to_ast.py | 10 +- .../cudf_polars/cudf_polars/dsl/translate.py | 14 +- .../tests/dsl/test_serialization.py | 56 ++++++++ 6 files changed, 304 insertions(+), 86 deletions(-) create mode 100644 python/cudf_polars/tests/dsl/test_serialization.py diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py index 8db8172ebd1..1682e7a8a9c 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -6,13 +6,12 @@ from __future__ import annotations +from enum import IntEnum, auto from functools import partial, reduce 
from typing import TYPE_CHECKING, Any, ClassVar import pyarrow as pa -from polars.polars import _expr_nodes as pl_expr - import pylibcudf as plc from cudf_polars.containers import Column @@ -24,7 +23,10 @@ if TYPE_CHECKING: from collections.abc import Mapping + from typing_extensions import Self + import polars.type_aliases as pl_types + from polars.polars import _expr_nodes as pl_expr from cudf_polars.containers import DataFrame @@ -32,13 +34,46 @@ class BooleanFunction(Expr): + class Name(IntEnum): + """Internal and picklable representation of polars' `BooleanFunction`.""" + + All = auto() + AllHorizontal = auto() + Any = auto() + AnyHorizontal = auto() + IsBetween = auto() + IsDuplicated = auto() + IsFinite = auto() + IsFirstDistinct = auto() + IsIn = auto() + IsInfinite = auto() + IsLastDistinct = auto() + IsNan = auto() + IsNotNan = auto() + IsNotNull = auto() + IsNull = auto() + IsUnique = auto() + Not = auto() + + @classmethod + def from_polars(cls, obj: pl_expr.BooleanFunction) -> Self: + """Convert from polars' `BooleanFunction`.""" + try: + function, name = str(obj).split(".", maxsplit=1) + except ValueError: + # Failed to unpack string + function = None + if function != "BooleanFunction": + raise ValueError("BooleanFunction required") + return getattr(cls, name) + __slots__ = ("name", "options") _non_child = ("dtype", "name", "options") def __init__( self, dtype: plc.DataType, - name: pl_expr.BooleanFunction, + name: BooleanFunction.Name, options: tuple[Any, ...], *children: Expr, ) -> None: @@ -46,7 +81,7 @@ def __init__( self.options = options self.name = name self.children = children - if self.name == pl_expr.BooleanFunction.IsIn and not all( + if self.name is BooleanFunction.Name.IsIn and not all( c.dtype == self.children[0].dtype for c in self.children ): # TODO: If polars IR doesn't put the casts in, we need to @@ -110,12 +145,12 @@ def do_evaluate( ) -> Column: """Evaluate this expression given a dataframe for context.""" if self.name in ( - 
pl_expr.BooleanFunction.IsFinite, - pl_expr.BooleanFunction.IsInfinite, + BooleanFunction.Name.IsFinite, + BooleanFunction.Name.IsInfinite, ): # Avoid evaluating the child if the dtype tells us it's unnecessary. (child,) = self.children - is_finite = self.name == pl_expr.BooleanFunction.IsFinite + is_finite = self.name is BooleanFunction.Name.IsFinite if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): value = plc.interop.from_arrow( pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype)) @@ -142,10 +177,10 @@ def do_evaluate( ] # Kleene logic for Any (OR) and All (AND) if ignore_nulls is # False - if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All): + if self.name in (BooleanFunction.Name.Any, BooleanFunction.Name.All): (ignore_nulls,) = self.options (column,) = columns - is_any = self.name == pl_expr.BooleanFunction.Any + is_any = self.name is BooleanFunction.Name.Any agg = plc.aggregation.any() if is_any else plc.aggregation.all() result = plc.reduce.reduce(column.obj, agg, self.dtype) if not ignore_nulls and column.obj.null_count() > 0: @@ -165,27 +200,27 @@ def do_evaluate( # False || Null => Null True && Null => Null return Column(plc.Column.all_null_like(column.obj, 1)) return Column(plc.Column.from_scalar(result, 1)) - if self.name == pl_expr.BooleanFunction.IsNull: + if self.name is BooleanFunction.Name.IsNull: (column,) = columns return Column(plc.unary.is_null(column.obj)) - elif self.name == pl_expr.BooleanFunction.IsNotNull: + elif self.name is BooleanFunction.Name.IsNotNull: (column,) = columns return Column(plc.unary.is_valid(column.obj)) - elif self.name == pl_expr.BooleanFunction.IsNan: + elif self.name is BooleanFunction.Name.IsNan: (column,) = columns return Column( plc.unary.is_nan(column.obj).with_mask( column.obj.null_mask(), column.obj.null_count() ) ) - elif self.name == pl_expr.BooleanFunction.IsNotNan: + elif self.name is BooleanFunction.Name.IsNotNan: (column,) = columns return Column( 
plc.unary.is_not_nan(column.obj).with_mask( column.obj.null_mask(), column.obj.null_count() ) ) - elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: + elif self.name is BooleanFunction.Name.IsFirstDistinct: (column,) = columns return self._distinct( column, @@ -197,7 +232,7 @@ def do_evaluate( pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) ), ) - elif self.name == pl_expr.BooleanFunction.IsLastDistinct: + elif self.name is BooleanFunction.Name.IsLastDistinct: (column,) = columns return self._distinct( column, @@ -209,7 +244,7 @@ def do_evaluate( pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) ), ) - elif self.name == pl_expr.BooleanFunction.IsUnique: + elif self.name is BooleanFunction.Name.IsUnique: (column,) = columns return self._distinct( column, @@ -221,7 +256,7 @@ def do_evaluate( pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) ), ) - elif self.name == pl_expr.BooleanFunction.IsDuplicated: + elif self.name is BooleanFunction.Name.IsDuplicated: (column,) = columns return self._distinct( column, @@ -233,7 +268,7 @@ def do_evaluate( pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) ), ) - elif self.name == pl_expr.BooleanFunction.AllHorizontal: + elif self.name is BooleanFunction.Name.AllHorizontal: return Column( reduce( partial( @@ -244,7 +279,7 @@ def do_evaluate( (c.obj for c in columns), ) ) - elif self.name == pl_expr.BooleanFunction.AnyHorizontal: + elif self.name is BooleanFunction.Name.AnyHorizontal: return Column( reduce( partial( @@ -255,10 +290,10 @@ def do_evaluate( (c.obj for c in columns), ) ) - elif self.name == pl_expr.BooleanFunction.IsIn: + elif self.name is BooleanFunction.Name.IsIn: needles, haystack = columns return Column(plc.search.contains(haystack.obj, needles.obj)) - elif self.name == pl_expr.BooleanFunction.Not: + elif self.name is BooleanFunction.Name.Not: (column,) = columns return Column( plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT) diff --git 
a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py index cd8e5c6a4eb..c2dddfd9940 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -6,12 +6,11 @@ from __future__ import annotations +from enum import IntEnum, auto from typing import TYPE_CHECKING, Any, ClassVar import pyarrow as pa -from polars.polars import _expr_nodes as pl_expr - import pylibcudf as plc from cudf_polars.containers import Column @@ -20,33 +19,94 @@ if TYPE_CHECKING: from collections.abc import Mapping + from typing_extensions import Self + + from polars.polars import _expr_nodes as pl_expr + from cudf_polars.containers import DataFrame __all__ = ["TemporalFunction"] class TemporalFunction(Expr): + class Name(IntEnum): + """Internal and picklable representation of polars' `TemporalFunction`.""" + + BaseUtcOffset = auto() + CastTimeUnit = auto() + Century = auto() + Combine = auto() + ConvertTimeZone = auto() + DSTOffset = auto() + Date = auto() + Datetime = auto() + DatetimeFunction = auto() + Day = auto() + Duration = auto() + Hour = auto() + IsLeapYear = auto() + IsoYear = auto() + Microsecond = auto() + Millennium = auto() + Millisecond = auto() + Minute = auto() + Month = auto() + MonthEnd = auto() + MonthStart = auto() + Nanosecond = auto() + OffsetBy = auto() + OrdinalDay = auto() + Quarter = auto() + ReplaceTimeZone = auto() + Round = auto() + Second = auto() + Time = auto() + TimeStamp = auto() + ToString = auto() + TotalDays = auto() + TotalHours = auto() + TotalMicroseconds = auto() + TotalMilliseconds = auto() + TotalMinutes = auto() + TotalNanoseconds = auto() + TotalSeconds = auto() + Truncate = auto() + Week = auto() + WeekDay = auto() + WithTimeUnit = auto() + Year = auto() + + @classmethod + def from_polars(cls, obj: pl_expr.TemporalFunction) -> Self: + """Convert from polars' `TemporalFunction`.""" + try: + function, name 
= str(obj).split(".", maxsplit=1) + except ValueError: + # Failed to unpack string + function = None + if function != "TemporalFunction": + raise ValueError("TemporalFunction required") + return getattr(cls, name) + __slots__ = ("name", "options") - _COMPONENT_MAP: ClassVar[ - dict[pl_expr.TemporalFunction, plc.datetime.DatetimeComponent] - ] = { - pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR, - pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH, - pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY, - pl_expr.TemporalFunction.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY, - pl_expr.TemporalFunction.Hour: plc.datetime.DatetimeComponent.HOUR, - pl_expr.TemporalFunction.Minute: plc.datetime.DatetimeComponent.MINUTE, - pl_expr.TemporalFunction.Second: plc.datetime.DatetimeComponent.SECOND, - pl_expr.TemporalFunction.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND, - pl_expr.TemporalFunction.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND, - pl_expr.TemporalFunction.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND, - } _non_child = ("dtype", "name", "options") + _COMPONENT_MAP: ClassVar[dict[Name, plc.datetime.DatetimeComponent]] = { + Name.Year: plc.datetime.DatetimeComponent.YEAR, + Name.Month: plc.datetime.DatetimeComponent.MONTH, + Name.Day: plc.datetime.DatetimeComponent.DAY, + Name.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY, + Name.Hour: plc.datetime.DatetimeComponent.HOUR, + Name.Minute: plc.datetime.DatetimeComponent.MINUTE, + Name.Second: plc.datetime.DatetimeComponent.SECOND, + Name.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND, + Name.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND, + Name.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND, + } def __init__( self, dtype: plc.DataType, - name: pl_expr.TemporalFunction, + name: TemporalFunction.Name, options: tuple[Any, ...], *children: Expr, ) -> None: @@ -70,7 +130,7 @@ def do_evaluate( for child in 
self.children ] (column,) = columns - if self.name == pl_expr.TemporalFunction.Microsecond: + if self.name is TemporalFunction.Name.Microsecond: millis = plc.datetime.extract_datetime_component( column.obj, plc.datetime.DatetimeComponent.MILLISECOND ) @@ -90,7 +150,7 @@ def do_evaluate( plc.types.DataType(plc.types.TypeId.INT32), ) return Column(total_micros) - elif self.name == pl_expr.TemporalFunction.Nanosecond: + elif self.name is TemporalFunction.Name.Nanosecond: millis = plc.datetime.extract_datetime_component( column.obj, plc.datetime.DatetimeComponent.MILLISECOND ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index 8b66c9d4676..92c3c658c21 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -6,13 +6,13 @@ from __future__ import annotations +from enum import IntEnum, auto from typing import TYPE_CHECKING, Any import pyarrow as pa import pyarrow.compute as pc from polars.exceptions import InvalidOperationError -from polars.polars import _expr_nodes as pl_expr import pylibcudf as plc @@ -23,19 +23,82 @@ if TYPE_CHECKING: from collections.abc import Mapping + from typing_extensions import Self + + from polars.polars import _expr_nodes as pl_expr + from cudf_polars.containers import DataFrame __all__ = ["StringFunction"] class StringFunction(Expr): + class Name(IntEnum): + """Internal and picklable representation of polars' `StringFunction`.""" + + Base64Decode = auto() + Base64Encode = auto() + ConcatHorizontal = auto() + ConcatVertical = auto() + Contains = auto() + ContainsMany = auto() + CountMatches = auto() + EndsWith = auto() + EscapeRegex = auto() + Extract = auto() + ExtractAll = auto() + ExtractGroups = auto() + Find = auto() + Head = auto() + HexDecode = auto() + HexEncode = auto() + JsonDecode = auto() + JsonPathMatch = auto() + LenBytes = auto() + LenChars = auto() + Lowercase = 
auto() + PadEnd = auto() + PadStart = auto() + Replace = auto() + ReplaceMany = auto() + Reverse = auto() + Slice = auto() + Split = auto() + SplitExact = auto() + SplitN = auto() + StartsWith = auto() + StripChars = auto() + StripCharsEnd = auto() + StripCharsStart = auto() + StripPrefix = auto() + StripSuffix = auto() + Strptime = auto() + Tail = auto() + Titlecase = auto() + ToDecimal = auto() + ToInteger = auto() + Uppercase = auto() + ZFill = auto() + + @classmethod + def from_polars(cls, obj: pl_expr.StringFunction) -> Self: + """Convert from polars' `StringFunction`.""" + try: + function, name = str(obj).split(".", maxsplit=1) + except ValueError: + # Failed to unpack string + function = None + if function != "StringFunction": + raise ValueError("StringFunction required") + return getattr(cls, name) + __slots__ = ("name", "options", "_regex_program") _non_child = ("dtype", "name", "options") def __init__( self, dtype: plc.DataType, - name: pl_expr.StringFunction, + name: StringFunction.Name, options: tuple[Any, ...], *children: Expr, ) -> None: @@ -47,21 +110,21 @@ def __init__( def _validate_input(self): if self.name not in ( - pl_expr.StringFunction.Contains, - pl_expr.StringFunction.EndsWith, - pl_expr.StringFunction.Lowercase, - pl_expr.StringFunction.Replace, - pl_expr.StringFunction.ReplaceMany, - pl_expr.StringFunction.Slice, - pl_expr.StringFunction.Strptime, - pl_expr.StringFunction.StartsWith, - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - pl_expr.StringFunction.Uppercase, + StringFunction.Name.Contains, + StringFunction.Name.EndsWith, + StringFunction.Name.Lowercase, + StringFunction.Name.Replace, + StringFunction.Name.ReplaceMany, + StringFunction.Name.Slice, + StringFunction.Name.Strptime, + StringFunction.Name.StartsWith, + StringFunction.Name.StripChars, + StringFunction.Name.StripCharsStart, + StringFunction.Name.StripCharsEnd, + StringFunction.Name.Uppercase, ): raise 
NotImplementedError(f"String function {self.name}") - if self.name == pl_expr.StringFunction.Contains: + if self.name is StringFunction.Name.Contains: literal, strict = self.options if not literal: if not strict: @@ -82,7 +145,7 @@ def _validate_input(self): raise NotImplementedError( f"Unsupported regex {pattern} for GPU engine." ) from e - elif self.name == pl_expr.StringFunction.Replace: + elif self.name is StringFunction.Name.Replace: _, literal = self.options if not literal: raise NotImplementedError("literal=False is not supported for replace") @@ -93,7 +156,7 @@ def _validate_input(self): raise NotImplementedError( "libcudf replace does not support empty strings" ) - elif self.name == pl_expr.StringFunction.ReplaceMany: + elif self.name is StringFunction.Name.ReplaceMany: (ascii_case_insensitive,) = self.options if ascii_case_insensitive: raise NotImplementedError( @@ -109,12 +172,12 @@ def _validate_input(self): "libcudf replace_many is implemented differently from polars " "for empty strings" ) - elif self.name == pl_expr.StringFunction.Slice: + elif self.name is StringFunction.Name.Slice: if not all(isinstance(child, Literal) for child in self.children[1:]): raise NotImplementedError( "Slice only supports literal start and stop values" ) - elif self.name == pl_expr.StringFunction.Strptime: + elif self.name is StringFunction.Name.Strptime: format, _, exact, cache = self.options if cache: raise NotImplementedError("Strptime cache is a CPU feature") @@ -123,9 +186,9 @@ def _validate_input(self): if not exact: raise NotImplementedError("Strptime does not support exact=False") elif self.name in { - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, + StringFunction.Name.StripChars, + StringFunction.Name.StripCharsStart, + StringFunction.Name.StripCharsEnd, }: if not isinstance(self.children[1], Literal): raise NotImplementedError( @@ -140,7 +203,7 @@ def do_evaluate( mapping: Mapping[Expr, 
Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - if self.name == pl_expr.StringFunction.Contains: + if self.name is StringFunction.Name.Contains: child, arg = self.children column = child.evaluate(df, context=context, mapping=mapping) @@ -157,7 +220,7 @@ def do_evaluate( return Column( plc.strings.contains.contains_re(column.obj, self._regex_program) ) - elif self.name == pl_expr.StringFunction.Slice: + elif self.name is StringFunction.Name.Slice: child, expr_offset, expr_length = self.children assert isinstance(expr_offset, Literal) assert isinstance(expr_length, Literal) @@ -188,16 +251,16 @@ def do_evaluate( ) ) elif self.name in { - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, + StringFunction.Name.StripChars, + StringFunction.Name.StripCharsStart, + StringFunction.Name.StripCharsEnd, }: column, chars = ( c.evaluate(df, context=context, mapping=mapping) for c in self.children ) - if self.name == pl_expr.StringFunction.StripCharsStart: + if self.name is StringFunction.Name.StripCharsStart: side = plc.strings.SideType.LEFT - elif self.name == pl_expr.StringFunction.StripCharsEnd: + elif self.name is StringFunction.Name.StripCharsEnd: side = plc.strings.SideType.RIGHT else: side = plc.strings.SideType.BOTH @@ -207,13 +270,13 @@ def do_evaluate( child.evaluate(df, context=context, mapping=mapping) for child in self.children ] - if self.name == pl_expr.StringFunction.Lowercase: + if self.name is StringFunction.Name.Lowercase: (column,) = columns return Column(plc.strings.case.to_lower(column.obj)) - elif self.name == pl_expr.StringFunction.Uppercase: + elif self.name is StringFunction.Name.Uppercase: (column,) = columns return Column(plc.strings.case.to_upper(column.obj)) - elif self.name == pl_expr.StringFunction.EndsWith: + elif self.name is StringFunction.Name.EndsWith: column, suffix = columns return Column( plc.strings.find.ends_with( @@ 
-223,7 +286,7 @@ def do_evaluate( else suffix.obj, ) ) - elif self.name == pl_expr.StringFunction.StartsWith: + elif self.name is StringFunction.Name.StartsWith: column, prefix = columns return Column( plc.strings.find.starts_with( @@ -233,7 +296,7 @@ def do_evaluate( else prefix.obj, ) ) - elif self.name == pl_expr.StringFunction.Strptime: + elif self.name is StringFunction.Name.Strptime: # TODO: ignores ambiguous format, strict, exact, cache = self.options col = self.children[0].evaluate(df, context=context, mapping=mapping) @@ -265,7 +328,7 @@ def do_evaluate( res.columns()[0], self.dtype, format ) ) - elif self.name == pl_expr.StringFunction.Replace: + elif self.name is StringFunction.Name.Replace: column, target, repl = columns n, _ = self.options return Column( @@ -273,7 +336,7 @@ def do_evaluate( column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n ) ) - elif self.name == pl_expr.StringFunction.ReplaceMany: + elif self.name is StringFunction.Name.ReplaceMany: column, target, repl = columns return Column( plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) diff --git a/python/cudf_polars/cudf_polars/dsl/to_ast.py b/python/cudf_polars/cudf_polars/dsl/to_ast.py index acc4b3669af..c3febc833e2 100644 --- a/python/cudf_polars/cudf_polars/dsl/to_ast.py +++ b/python/cudf_polars/cudf_polars/dsl/to_ast.py @@ -8,8 +8,6 @@ from functools import partial, reduce, singledispatch from typing import TYPE_CHECKING, TypeAlias -from polars.polars import _expr_nodes as pl_expr - import pylibcudf as plc from pylibcudf import expressions as plc_expr @@ -185,7 +183,7 @@ def _(node: expr.BinOp, self: Transformer) -> plc_expr.Expression: @_to_ast.register def _(node: expr.BooleanFunction, self: Transformer) -> plc_expr.Expression: - if node.name == pl_expr.BooleanFunction.IsIn: + if node.name is expr.BooleanFunction.Name.IsIn: needles, haystack = node.children if isinstance(haystack, expr.LiteralColumn) and len(haystack.value) < 16: # 16 is an arbitrary limit 
@@ -204,14 +202,14 @@ def _(node: expr.BooleanFunction, self: Transformer) -> plc_expr.Expression: raise NotImplementedError( f"Parquet filters don't support {node.name} on columns" ) - if node.name == pl_expr.BooleanFunction.IsNull: + if node.name is expr.BooleanFunction.Name.IsNull: return plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0])) - elif node.name == pl_expr.BooleanFunction.IsNotNull: + elif node.name is expr.BooleanFunction.Name.IsNotNull: return plc_expr.Operation( plc_expr.ASTOperator.NOT, plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0])), ) - elif node.name == pl_expr.BooleanFunction.Not: + elif node.name is expr.BooleanFunction.Name.Not: return plc_expr.Operation(plc_expr.ASTOperator.NOT, self(node.children[0])) raise NotImplementedError(f"AST conversion does not support {node.name}") diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 9480ce6e535..b1e2de63ba6 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -531,10 +531,16 @@ def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> ex column.dtype, pa.scalar("", type=plc.interop.to_arrow(column.dtype)), ) - return expr.StringFunction(dtype, name, options, column, chars) + return expr.StringFunction( + dtype, + expr.StringFunction.Name.from_polars(name), + options, + column, + chars, + ) return expr.StringFunction( dtype, - name, + expr.StringFunction.Name.from_polars(name), options, *(translator.translate_expr(n=n) for n in node.input), ) @@ -551,7 +557,7 @@ def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> ex ) return expr.BooleanFunction( dtype, - name, + expr.BooleanFunction.Name.from_polars(name), options, *(translator.translate_expr(n=n) for n in node.input), ) @@ -571,7 +577,7 @@ def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> ex } 
result_expr = expr.TemporalFunction( dtype, - name, + expr.TemporalFunction.Name.from_polars(name), options, *(translator.translate_expr(n=n) for n in node.input), ) diff --git a/python/cudf_polars/tests/dsl/test_serialization.py b/python/cudf_polars/tests/dsl/test_serialization.py new file mode 100644 index 00000000000..7de8f959843 --- /dev/null +++ b/python/cudf_polars/tests/dsl/test_serialization.py @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pickle + +import pytest + +from polars.polars import _expr_nodes as pl_expr + +from cudf_polars.dsl.expressions.boolean import BooleanFunction +from cudf_polars.dsl.expressions.datetime import TemporalFunction +from cudf_polars.dsl.expressions.string import StringFunction + + +@pytest.fixture(params=[BooleanFunction, StringFunction, TemporalFunction]) +def function(request): + return request.param + + +def test_function_name_serialization_all_values(function): + # Test serialization and deserialization for all values of function.Name + for name in function.Name: + serialized_name = pickle.dumps(name) + deserialized_name = pickle.loads(serialized_name) + assert deserialized_name is name + + +def test_function_name_invalid(function): + # Test invalid attribute name + with pytest.raises(AttributeError, match="InvalidAttribute"): + assert function.Name.InvalidAttribute is function.Name.InvalidAttribute + + +def test_from_polars_all_names(function): + # Test that all valid names of polars expressions are correctly converted + polars_function = getattr(pl_expr, function.__name__) + polars_names = [name for name in dir(polars_function) if not name.startswith("_")] + # Check names advertised by polars are the same as we advertise + assert set(polars_names) == set(function.Name.__members__) + for name in function.Name: + attr = getattr(polars_function, name.name) + assert 
function.Name.from_polars(attr) == name + + +def test_from_polars_invalid_attribute(function): + # Test converting from invalid attribute name + with pytest.raises(ValueError, match=f"{function.__name__} required"): + function.Name.from_polars("InvalidAttribute") + + +def test_from_polars_invalid_polars_attribute(function): + # Test converting from polars function with invalid attribute name + with pytest.raises(AttributeError, match="InvalidAttribute"): + function.Name.from_polars(f"{function.__name__}.InvalidAttribute") From 165d756f7f7cb558d1cab62a81a1c91368648d12 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 26 Nov 2024 13:38:22 -0500 Subject: [PATCH 05/14] Migrate ORC Writer to pylibcudf (#17310) Apart of #15162. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17310 --- python/cudf/cudf/_lib/orc.pyx | 167 ++++--- python/pylibcudf/pylibcudf/io/orc.pxd | 65 ++- python/pylibcudf/pylibcudf/io/orc.pyi | 51 ++- python/pylibcudf/pylibcudf/io/orc.pyx | 413 +++++++++++++++++- python/pylibcudf/pylibcudf/io/types.pxd | 17 +- python/pylibcudf/pylibcudf/io/types.pyi | 22 +- python/pylibcudf/pylibcudf/io/types.pyx | 61 ++- .../pylibcudf/pylibcudf/tests/io/test_orc.py | 62 +++ .../pylibcudf/tests/io/test_types.py | 28 ++ 9 files changed, 762 insertions(+), 124 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/tests/io/test_types.py diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 32a5e463916..c829cac6409 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -3,11 +3,9 @@ from libc.stdint cimport int64_t from libcpp cimport bool, int from libcpp.map cimport map -from libcpp.memory cimport unique_ptr from libcpp.string cimport string -from libcpp.utility cimport move from libcpp.vector cimport 
vector - +import itertools from collections import OrderedDict try: @@ -16,23 +14,10 @@ except ImportError: import json cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.orc cimport ( - chunked_orc_writer_options, - orc_chunked_writer, - orc_writer_options, - write_orc as libcudf_write_orc, -) -from pylibcudf.libcudf.io.types cimport ( - column_in_metadata, - sink_info, - table_input_metadata, -) -from pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport make_sink_info, update_col_struct_field_names -from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table +from cudf._lib.io.utils cimport update_col_struct_field_names +from cudf._lib.utils cimport data_from_pylibcudf_io import pylibcudf as plc @@ -40,7 +25,8 @@ import cudf from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES from cudf._lib.utils import _index_level_name, generate_pandas_metadata from cudf.core.buffer import acquire_spill_lock - +from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata +from pylibcudf.io.orc cimport OrcChunkedWriter # TODO: Consider inlining this function since it seems to only be used in one place. 
cpdef read_parsed_orc_statistics(filepath_or_buffer): @@ -246,36 +232,33 @@ def write_orc( -------- cudf.read_orc """ - cdef unique_ptr[data_sink] data_sink_c - cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c) - cdef table_input_metadata tbl_meta - cdef map[string, string] user_data - user_data[str.encode("pandas")] = str.encode(generate_pandas_metadata( - table, index) - ) - + user_data = {} + user_data["pandas"] = generate_pandas_metadata(table, index) if index is True or ( index is None and not isinstance(table._index, cudf.RangeIndex) ): - tv = table_view_from_table(table) - tbl_meta = table_input_metadata(tv) + columns = table._columns if table._index is None else [ + *table.index._columns, *table._columns + ] + plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) + tbl_meta = TableInputMetadata(plc_table) for level, idx_name in enumerate(table._index.names): tbl_meta.column_metadata[level].set_name( - str.encode( - _index_level_name(idx_name, level, table._column_names) - ) + _index_level_name(idx_name, level, table._column_names) ) num_index_cols_meta = len(table._index.names) else: - tv = table_view_from_table(table, ignore_index=True) - tbl_meta = table_input_metadata(tv) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + tbl_meta = TableInputMetadata(plc_table) num_index_cols_meta = 0 if cols_as_map_type is not None: cols_as_map_type = set(cols_as_map_type) for i, name in enumerate(table._column_names, num_index_cols_meta): - tbl_meta.column_metadata[i].set_name(name.encode()) + tbl_meta.column_metadata[i].set_name(name) _set_col_children_metadata( table[name]._column, tbl_meta.column_metadata[i], @@ -283,24 +266,24 @@ def write_orc( and (name in cols_as_map_type), ) - cdef orc_writer_options c_orc_writer_options = move( - orc_writer_options.builder( - sink_info_c, tv - ).metadata(tbl_meta) - .key_value_metadata(move(user_data)) + options = ( + 
plc.io.orc.OrcWriterOptions.builder( + plc.io.SinkInfo([path_or_buf]), plc_table + ) + .metadata(tbl_meta) + .key_value_metadata(user_data) .compression(_get_comp_type(compression)) .enable_statistics(_get_orc_stat_freq(statistics)) .build() ) if stripe_size_bytes is not None: - c_orc_writer_options.set_stripe_size_bytes(stripe_size_bytes) + options.set_stripe_size_bytes(stripe_size_bytes) if stripe_size_rows is not None: - c_orc_writer_options.set_stripe_size_rows(stripe_size_rows) + options.set_stripe_size_rows(stripe_size_rows) if row_index_stride is not None: - c_orc_writer_options.set_row_index_stride(row_index_stride) + options.set_row_index_stride(row_index_stride) - with nogil: - libcudf_write_orc(c_orc_writer_options) + plc.io.orc.write_orc(options) cdef int64_t get_skiprows_arg(object arg) except*: @@ -326,13 +309,12 @@ cdef class ORCWriter: cudf.io.orc.to_orc """ cdef bool initialized - cdef unique_ptr[orc_chunked_writer] writer - cdef sink_info sink - cdef unique_ptr[data_sink] _data_sink + cdef OrcChunkedWriter writer + cdef SinkInfo sink cdef str statistics cdef object compression cdef object index - cdef table_input_metadata tbl_meta + cdef TableInputMetadata tbl_meta cdef object cols_as_map_type cdef object stripe_size_bytes cdef object stripe_size_rows @@ -347,8 +329,7 @@ cdef class ORCWriter: object stripe_size_bytes=None, object stripe_size_rows=None, object row_index_stride=None): - - self.sink = make_sink_info(path, self._data_sink) + self.sink = plc.io.SinkInfo([path]) self.statistics = statistics self.compression = compression self.index = index @@ -368,17 +349,21 @@ cdef class ORCWriter: table._index.name is not None or isinstance(table._index, cudf.core.multiindex.MultiIndex) ) - tv = table_view_from_table(table, not keep_index) + if keep_index: + columns = [ + col.to_pylibcudf(mode="read") + for col in itertools.chain(table.index._columns, table._columns) + ] + else: + columns = [col.to_pylibcudf(mode="read") for col in table._columns] - 
with nogil: - self.writer.get()[0].write(tv) + self.writer.write(plc.Table(columns)) def close(self): if not self.initialized: return - with nogil: - self.writer.get()[0].close() + self.writer.close() def __dealloc__(self): self.close() @@ -387,32 +372,47 @@ cdef class ORCWriter: """ Prepare all the values required to build the chunked_orc_writer_options anb creates a writer""" - cdef table_view tv num_index_cols_meta = 0 - self.tbl_meta = table_input_metadata( - table_view_from_table(table, ignore_index=True), + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in table._columns + ] ) + self.tbl_meta = TableInputMetadata(plc_table) if self.index is not False: if isinstance(table._index, cudf.core.multiindex.MultiIndex): - tv = table_view_from_table(table) - self.tbl_meta = table_input_metadata(tv) + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain(table.index._columns, table._columns) + ] + ) + self.tbl_meta = TableInputMetadata(plc_table) for level, idx_name in enumerate(table._index.names): self.tbl_meta.column_metadata[level].set_name( - (str.encode(idx_name)) + idx_name ) num_index_cols_meta = len(table._index.names) else: if table._index.name is not None: - tv = table_view_from_table(table) - self.tbl_meta = table_input_metadata(tv) + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] + ) + self.tbl_meta = TableInputMetadata(plc_table) self.tbl_meta.column_metadata[0].set_name( - str.encode(table._index.name) + table._index.name ) num_index_cols_meta = 1 for i, name in enumerate(table._column_names, num_index_cols_meta): - self.tbl_meta.column_metadata[i].set_name(name.encode()) + self.tbl_meta.column_metadata[i].set_name(name) _set_col_children_metadata( table[name]._column, self.tbl_meta.column_metadata[i], @@ -420,38 +420,37 @@ cdef class ORCWriter: and (name in self.cols_as_map_type), ) - cdef map[string, string] 
user_data + user_data = {} pandas_metadata = generate_pandas_metadata(table, self.index) - user_data[str.encode("pandas")] = str.encode(pandas_metadata) - - cdef chunked_orc_writer_options c_opts = move( - chunked_orc_writer_options.builder(self.sink) - .metadata(self.tbl_meta) - .key_value_metadata(move(user_data)) - .compression(_get_comp_type(self.compression)) - .enable_statistics(_get_orc_stat_freq(self.statistics)) - .build() - ) + user_data["pandas"] = pandas_metadata + + options = ( + plc.io.orc.ChunkedOrcWriterOptions.builder(self.sink) + .metadata(self.tbl_meta) + .key_value_metadata(user_data) + .compression(_get_comp_type(self.compression)) + .enable_statistics(_get_orc_stat_freq(self.statistics)) + .build() + ) if self.stripe_size_bytes is not None: - c_opts.set_stripe_size_bytes(self.stripe_size_bytes) + options.set_stripe_size_bytes(self.stripe_size_bytes) if self.stripe_size_rows is not None: - c_opts.set_stripe_size_rows(self.stripe_size_rows) + options.set_stripe_size_rows(self.stripe_size_rows) if self.row_index_stride is not None: - c_opts.set_row_index_stride(self.row_index_stride) + options.set_row_index_stride(self.row_index_stride) - with nogil: - self.writer.reset(new orc_chunked_writer(c_opts)) + self.writer = plc.io.orc.OrcChunkedWriter.from_options(options) self.initialized = True cdef _set_col_children_metadata(Column col, - column_in_metadata& col_meta, + ColumnInMetadata col_meta, list_column_as_map=False): if isinstance(col.dtype, cudf.StructDtype): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): - col_meta.child(i).set_name(name.encode()) + col_meta.child(i).set_name(name) _set_col_children_metadata( child_col, col_meta.child(i), list_column_as_map ) diff --git a/python/pylibcudf/pylibcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/io/orc.pxd index b111d617b1b..671f0692444 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pxd +++ b/python/pylibcudf/pylibcudf/io/orc.pxd @@ -4,15 +4,33 @@ from libcpp 
cimport bool from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector -from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from libcpp.memory cimport unique_ptr +from libcpp.map cimport map +from pylibcudf.io.types cimport ( + SourceInfo, + SinkInfo, + TableWithMetadata, + TableInputMetadata, +) from pylibcudf.libcudf.io.orc_metadata cimport ( column_statistics, parsed_orc_statistics, statistics_type, ) +from pylibcudf.libcudf.io.orc cimport ( + orc_chunked_writer, + orc_writer_options, + orc_writer_options_builder, + chunked_orc_writer_options, + chunked_orc_writer_options_builder, +) from pylibcudf.libcudf.types cimport size_type from pylibcudf.types cimport DataType - +from pylibcudf.table cimport Table +from pylibcudf.libcudf.io.types cimport ( + compression_type, + statistics_freq, +) cpdef TableWithMetadata read_orc( SourceInfo source_info, @@ -48,3 +66,46 @@ cdef class ParsedOrcStatistics: cpdef ParsedOrcStatistics read_parsed_orc_statistics( SourceInfo source_info ) + +cdef class OrcWriterOptions: + cdef orc_writer_options c_obj + cdef Table table + cdef SinkInfo sink + cpdef void set_stripe_size_bytes(self, size_t size_bytes) + cpdef void set_stripe_size_rows(self, size_type size_rows) + cpdef void set_row_index_stride(self, size_type stride) + +cdef class OrcWriterOptionsBuilder: + cdef orc_writer_options_builder c_obj + cdef Table table + cdef SinkInfo sink + cpdef OrcWriterOptionsBuilder compression(self, compression_type comp) + cpdef OrcWriterOptionsBuilder enable_statistics(self, statistics_freq val) + cpdef OrcWriterOptionsBuilder key_value_metadata(self, dict kvm) + cpdef OrcWriterOptionsBuilder metadata(self, TableInputMetadata meta) + cpdef OrcWriterOptions build(self) + +cpdef void write_orc(OrcWriterOptions options) + +cdef class OrcChunkedWriter: + cdef unique_ptr[orc_chunked_writer] c_obj + cpdef void close(self) + cpdef void write(self, Table table) + +cdef class 
ChunkedOrcWriterOptions: + cdef chunked_orc_writer_options c_obj + cdef SinkInfo sink + cpdef void set_stripe_size_bytes(self, size_t size_bytes) + cpdef void set_stripe_size_rows(self, size_type size_rows) + cpdef void set_row_index_stride(self, size_type stride) + +cdef class ChunkedOrcWriterOptionsBuilder: + cdef chunked_orc_writer_options_builder c_obj + cdef SinkInfo sink + cpdef ChunkedOrcWriterOptionsBuilder compression(self, compression_type comp) + cpdef ChunkedOrcWriterOptionsBuilder enable_statistics(self, statistics_freq val) + cpdef ChunkedOrcWriterOptionsBuilder key_value_metadata( + self, dict kvm + ) + cpdef ChunkedOrcWriterOptionsBuilder metadata(self, TableInputMetadata meta) + cpdef ChunkedOrcWriterOptions build(self) diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi index 4cf87f1a832..516f97981e9 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyi +++ b/python/pylibcudf/pylibcudf/io/orc.pyi @@ -1,8 +1,16 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from typing import Any +from typing import Any, Self -from pylibcudf.io.types import SourceInfo, TableWithMetadata +from pylibcudf.io.types import ( + CompressionType, + SinkInfo, + SourceInfo, + StatisticsFreq, + TableInputMetadata, + TableWithMetadata, +) +from pylibcudf.table import Table from pylibcudf.types import DataType def read_orc( @@ -39,3 +47,42 @@ class ParsedOrcStatistics: def read_parsed_orc_statistics( source_info: SourceInfo, ) -> ParsedOrcStatistics: ... + +class OrcWriterOptions: + def __init__(self): ... + def set_stripe_size_bytes(self, size_bytes: int) -> None: ... + def set_stripe_size_rows(self, size_rows: int) -> None: ... + def set_row_index_stride(self, stride: int) -> None: ... + @staticmethod + def builder(sink: SinkInfo, table: Table) -> OrcWriterOptionsBuilder: ... + +class OrcWriterOptionsBuilder: + def __init__(self): ... + def compression(self, comp: CompressionType) -> Self: ... 
+ def enable_statistics(self, val: StatisticsFreq) -> Self: ... + def key_value_metadata(self, kvm: dict[str, str]) -> Self: ... + def metadata(self, meta: TableInputMetadata) -> Self: ... + def build(self) -> OrcWriterOptions: ... + +def write_orc(options: OrcWriterOptions) -> None: ... + +class OrcChunkedWriter: + def __init__(self): ... + def close(self) -> None: ... + def write(self, table: Table) -> None: ... + +class ChunkedOrcWriterOptions: + def __init__(self): ... + def set_stripe_size_bytes(self, size_bytes: int) -> None: ... + def set_stripe_size_rows(self, size_rows: int) -> None: ... + def set_row_index_stride(self, stride: int) -> None: ... + @staticmethod + def builder(sink: SinkInfo) -> ChunkedOrcWriterOptionsBuilder: ... + +class ChunkedOrcWriterOptionsBuilder: + def __init__(self): ... + def compression(self, comp: CompressionType) -> Self: ... + def enable_statistics(self, val: StatisticsFreq) -> Self: ... + def key_value_metadata(self, kvm: dict[str, str]) -> Self: ... + def metadata(self, meta: TableInputMetadata) -> Self: ... + def build(self) -> ChunkedOrcWriterOptions: ...
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 4270f5b4f95..63eab4a9634 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -6,10 +6,11 @@ from libcpp.vector cimport vector import datetime -from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.io.types cimport SourceInfo, TableWithMetadata, SinkInfo from pylibcudf.libcudf.io.orc cimport ( orc_reader_options, read_orc as cpp_read_orc, + write_orc as cpp_write_orc, ) from pylibcudf.libcudf.io.orc_metadata cimport ( binary_statistics, @@ -29,12 +30,27 @@ from pylibcudf.libcudf.io.types cimport table_with_metadata from pylibcudf.libcudf.types cimport size_type from pylibcudf.types cimport DataType from pylibcudf.variant cimport get_if, holds_alternative +from pylibcudf.libcudf.io.types cimport ( + compression_type, + statistics_freq, +) +from pylibcudf.libcudf.io.orc cimport ( + orc_chunked_writer, + orc_writer_options, + chunked_orc_writer_options, +) __all__ = [ "OrcColumnStatistics", "ParsedOrcStatistics", "read_orc", "read_parsed_orc_statistics", + "write_orc", + "OrcWriterOptions", + "OrcWriterOptionsBuilder", + "OrcChunkedWriter", + "ChunkedOrcWriterOptions", + "ChunkedOrcWriterOptionsBuilder", ] cdef class OrcColumnStatistics: @@ -310,3 +326,398 @@ cpdef ParsedOrcStatistics read_parsed_orc_statistics( cpp_read_parsed_orc_statistics(source_info.c_obj) ) return ParsedOrcStatistics.from_libcudf(parsed) + + +cdef class OrcWriterOptions: + cpdef void set_stripe_size_bytes(self, size_t size_bytes): + """ + Sets the maximum stripe size, in bytes. + + For details, see :cpp:func:`cudf::io::orc_writer_options::set_stripe_size_bytes` + + Parameters + ---------- + size_bytes: size_t + Sets the maximum stripe size, in bytes. 
+ + Returns + ------- + None + """ + self.c_obj.set_stripe_size_bytes(size_bytes) + + cpdef void set_stripe_size_rows(self, size_type size_rows): + """ + Sets the maximum stripe size, in rows. + + If the stripe size is smaller than the row group size, + row group size will be reduced to match the stripe size. + + For details, see :cpp:func:`cudf::io::orc_writer_options::set_stripe_size_rows` + + Parameters + ---------- + size_rows: size_type + Maximum stripe size, in rows to be set + + Returns + ------- + None + """ + self.c_obj.set_stripe_size_rows(size_rows) + + cpdef void set_row_index_stride(self, size_type stride): + """ + Sets the row index stride. + + Rounded down to a multiple of 8. + + For details, see :cpp:func:`cudf::io::orc_writer_options::set_row_index_stride` + + Parameters + ---------- + stride: size_type + Row index stride to be set + + Returns + ------- + None + """ + self.c_obj.set_row_index_stride(stride) + + @staticmethod + def builder(SinkInfo sink, Table table): + """ + Create builder to create OrcWriterOptions. + + For details, see :cpp:func:`cudf::io::orc_writer_options::builder` + + Parameters + ---------- + sink: SinkInfo + The sink used for writer output + table: Table + Table to be written to output + + Returns + ------- + OrcWriterOptionsBuilder + """ + cdef OrcWriterOptionsBuilder orc_builder = OrcWriterOptionsBuilder.__new__( + OrcWriterOptionsBuilder + ) + orc_builder.c_obj = orc_writer_options.builder(sink.c_obj, table.view()) + orc_builder.table = table + orc_builder.sink = sink + return orc_builder + + +cdef class OrcWriterOptionsBuilder: + cpdef OrcWriterOptionsBuilder compression(self, compression_type comp): + """ + Sets compression type.
+ + For details, see :cpp:func:`cudf::io::orc_writer_options_builder::compression` + + Parameters + ---------- + comp: CompressionType + The compression type to use + + Returns + ------- + OrcWriterOptionsBuilder + """ + self.c_obj.compression(comp) + return self + + cpdef OrcWriterOptionsBuilder enable_statistics(self, statistics_freq val): + """ + Choose granularity of column statistics to be written. + + For details, see :cpp:func:`cudf::io::orc_writer_options_builder::enable_statistics` + + Parameters + ---------- + val: StatisticsFreq + Level of statistics collection + + Returns + ------- + OrcWriterOptionsBuilder + """ + self.c_obj.enable_statistics(val) + return self + + cpdef OrcWriterOptionsBuilder key_value_metadata(self, dict kvm): + """ + Sets Key-Value footer metadata. + + Parameters + ---------- + kvm: dict + Key-Value footer metadata + + Returns + ------- + OrcWriterOptionsBuilder + """ + self.c_obj.key_value_metadata( + {key.encode(): value.encode() for key, value in kvm.items()} + ) + return self + + cpdef OrcWriterOptionsBuilder metadata(self, TableInputMetadata meta): + """ + Sets associated metadata. + + For details, see :cpp:func:`cudf::io::orc_writer_options_builder::metadata` + + Parameters + ---------- + meta: TableInputMetadata + Associated metadata + + Returns + ------- + OrcWriterOptionsBuilder + """ + self.c_obj.metadata(meta.c_obj) + return self + + cpdef OrcWriterOptions build(self): + """Moves the ORC writer options builder""" + cdef OrcWriterOptions orc_options = OrcWriterOptions.__new__( + OrcWriterOptions + ) + orc_options.c_obj = move(self.c_obj.build()) + orc_options.table = self.table + orc_options.sink = self.sink + return orc_options + + +cpdef void write_orc(OrcWriterOptions options): + """ + Write to ORC format. + + The table to write, output paths, and options are encapsulated + by the `options` object. + + For details, see :cpp:func:`write_orc`.
+ + Parameters + ---------- + options: OrcWriterOptions + Settings for controlling writing behavior + + Returns + ------- + None + """ + with nogil: + cpp_write_orc(move(options.c_obj)) + + +cdef class OrcChunkedWriter: + cpdef void close(self): + """ + Closes the chunked ORC writer. + + Returns + ------- + None + """ + with nogil: + self.c_obj.get()[0].close() + + cpdef void write(self, Table table): + """ + Writes table to output. + + Parameters + ---------- + table: Table + Table that needs to be written + + Returns + ------- + None + """ + with nogil: + self.c_obj.get()[0].write(table.view()) + + @staticmethod + def from_options(ChunkedOrcWriterOptions options): + """ + Creates a chunked ORC writer from options + + Parameters + ---------- + options: ChunkedOrcWriterOptions + Settings for controlling writing behavior + + Returns + ------- + OrcChunkedWriter + """ + cdef OrcChunkedWriter orc_writer = OrcChunkedWriter.__new__( + OrcChunkedWriter + ) + orc_writer.c_obj.reset(new orc_chunked_writer(options.c_obj)) + return orc_writer + + +cdef class ChunkedOrcWriterOptions: + cpdef void set_stripe_size_bytes(self, size_t size_bytes): + """ + Sets the maximum stripe size, in bytes. + + Parameters + ---------- + size_bytes: size_t + Sets the maximum stripe size, in bytes. + + Returns + ------- + None + """ + self.c_obj.set_stripe_size_bytes(size_bytes) + + cpdef void set_stripe_size_rows(self, size_type size_rows): + """ + Sets the maximum stripe size, in rows. + + If the stripe size is smaller than the row group size, + row group size will be reduced to match the stripe size. + + Parameters + ---------- + size_rows: size_type + Maximum stripe size, in rows to be set + + Returns + ------- + None + """ + self.c_obj.set_stripe_size_rows(size_rows) + + cpdef void set_row_index_stride(self, size_type stride): + """ + Sets the row index stride. + + Rounded down to a multiple of 8.
+ + Parameters + ---------- + stride: size_type + Row index stride to be set + + Returns + ------- + None + """ + self.c_obj.set_row_index_stride(stride) + + @staticmethod + def builder(SinkInfo sink): + """ + Create builder to create ChunkedOrcWriterOptions. + + Parameters + ---------- + sink: SinkInfo + The sink used for writer output + table: Table + Table to be written to output + + Returns + ------- + ChunkedOrcWriterOptionsBuilder + """ + cdef ChunkedOrcWriterOptionsBuilder orc_builder = \ + ChunkedOrcWriterOptionsBuilder.__new__( + ChunkedOrcWriterOptionsBuilder + ) + orc_builder.c_obj = chunked_orc_writer_options.builder(sink.c_obj) + orc_builder.sink = sink + return orc_builder + + +cdef class ChunkedOrcWriterOptionsBuilder: + cpdef ChunkedOrcWriterOptionsBuilder compression(self, compression_type comp): + """ + Sets compression type. + + Parameters + ---------- + comp: CompressionType + The compression type to use + + Returns + ------- + ChunkedOrcWriterOptionsBuilder + """ + self.c_obj.compression(comp) + return self + + cpdef ChunkedOrcWriterOptionsBuilder enable_statistics(self, statistics_freq val): + """ + Choose granularity of column statistics to be written. + + Parameters + ---------- + val: StatisticsFreq + Level of statistics collection + + Returns + ------- + ChunkedOrcWriterOptionsBuilder + """ + self.c_obj.enable_statistics(val) + return self + + cpdef ChunkedOrcWriterOptionsBuilder key_value_metadata( + self, + dict kvm + ): + """ + Sets Key-Value footer metadata. + + Parameters + ---------- + kvm: dict + Key-Value footer metadata + + Returns + ------- + ChunkedOrcWriterOptionsBuilder + """ + self.c_obj.key_value_metadata( + {key.encode(): value.encode() for key, value in kvm.items()} + ) + return self + + cpdef ChunkedOrcWriterOptionsBuilder metadata(self, TableInputMetadata meta): + """ + Sets associated metadata.
+ + Parameters + ---------- + meta: TableInputMetadata + Associated metadata + + Returns + ------- + ChunkedOrcWriterOptionsBuilder + """ + self.c_obj.metadata(meta.c_obj) + return self + + cpdef ChunkedOrcWriterOptions build(self): + """Create a OrcWriterOptions object""" + cdef ChunkedOrcWriterOptions orc_options = ChunkedOrcWriterOptions.__new__( + ChunkedOrcWriterOptions + ) + orc_options.c_obj = move(self.c_obj.build()) + orc_options.sink = self.sink + return orc_options diff --git a/python/pylibcudf/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd index 90b43cf0ff5..a1f3b17936c 100644 --- a/python/pylibcudf/pylibcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/io/types.pxd @@ -3,6 +3,7 @@ from libc.stdint cimport uint8_t, int32_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from libcpp cimport bool from pylibcudf.libcudf.io.data_sink cimport data_sink from pylibcudf.libcudf.io.types cimport ( column_encoding, @@ -22,16 +23,16 @@ from pylibcudf.libcudf.io.types cimport ( ) from pylibcudf.libcudf.types cimport size_type from pylibcudf.table cimport Table - +from pylibcudf.libcudf.types cimport size_type cdef class PartitionInfo: cdef partition_info c_obj cdef class ColumnInMetadata: - cdef column_in_metadata c_obj + cdef column_in_metadata* c_obj + cdef TableInputMetadata owner - @staticmethod - cdef ColumnInMetadata from_metadata(column_in_metadata metadata) + cdef TableInputMetadata table cpdef ColumnInMetadata set_name(self, str name) @@ -43,7 +44,7 @@ cdef class ColumnInMetadata: cpdef ColumnInMetadata set_int96_timestamps(self, bool req) - cpdef ColumnInMetadata set_decimal_precision(self, uint8_t req) + cpdef ColumnInMetadata set_decimal_precision(self, uint8_t precision) cpdef ColumnInMetadata child(self, size_type i) @@ -57,8 +58,14 @@ cdef class ColumnInMetadata: cpdef str get_name(self) + @staticmethod + cdef ColumnInMetadata from_libcudf( + column_in_metadata* metadata, 
TableInputMetadata owner + ) + cdef class TableInputMetadata: cdef table_input_metadata c_obj + cdef list column_metadata cdef class TableWithMetadata: cdef public Table tbl diff --git a/python/pylibcudf/pylibcudf/io/types.pyi b/python/pylibcudf/pylibcudf/io/types.pyi index 04f276cfeee..a3a559219ff 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyi +++ b/python/pylibcudf/pylibcudf/io/types.pyi @@ -3,7 +3,7 @@ import io import os from collections.abc import Mapping from enum import IntEnum -from typing import Any, Literal, TypeAlias, overload +from typing import Any, Literal, Self, TypeAlias, overload from pylibcudf.column import Column from pylibcudf.io.datasource import Datasource @@ -66,16 +66,16 @@ class TableInputMetadata: def __init__(self, table: Table): ... class ColumnInMetadata: - def set_name(self, name: str) -> ColumnInMetadata: ... - def set_nullability(self, nullable: bool) -> ColumnInMetadata: ... - def set_list_column_as_map(self) -> ColumnInMetadata: ... - def set_int96_timestamps(self, req: bool) -> ColumnInMetadata: ... - def set_decimal_precision(self, precision: int) -> ColumnInMetadata: ... - def child(self, i: int) -> ColumnInMetadata: ... - def set_output_as_binary(self, binary: bool) -> ColumnInMetadata: ... - def set_type_length(self, type_length: int) -> ColumnInMetadata: ... - def set_skip_compression(self, skip: bool) -> ColumnInMetadata: ... - def set_encoding(self, encoding: ColumnEncoding) -> ColumnInMetadata: ... + def set_name(self, name: str) -> Self: ... + def set_nullability(self, nullable: bool) -> Self: ... + def set_list_column_as_map(self) -> Self: ... + def set_int96_timestamps(self, req: bool) -> Self: ... + def set_decimal_precision(self, precision: int) -> Self: ... + def child(self, i: int) -> Self: ... + def set_output_as_binary(self, binary: bool) -> Self: ... + def set_type_length(self, type_length: int) -> Self: ... + def set_skip_compression(self, skip: bool) -> Self: ... 
+ def set_encoding(self, encoding: ColumnEncoding) -> Self: ... def get_name(self) -> str: ... class TableWithMetadata: diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index 460ab6844c3..a2155829f2c 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -2,7 +2,6 @@ from cpython.buffer cimport PyBUF_READ from cpython.memoryview cimport PyMemoryView_FromMemory -from libc.stdint cimport uint8_t, int32_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -20,6 +19,8 @@ from pylibcudf.libcudf.io.types cimport ( source_info, table_input_metadata, table_with_metadata, + column_in_metadata, + table_input_metadata, ) from pylibcudf.libcudf.types cimport size_type @@ -38,9 +39,14 @@ from pylibcudf.libcudf.io.types import ( quote_style as QuoteStyle, # no-cython-lint statistics_freq as StatisticsFreq, # no-cython-lint ) +from cython.operator cimport dereference +from pylibcudf.libcudf.types cimport size_type +from cython.operator cimport dereference +from pylibcudf.libcudf.types cimport size_type __all__ = [ "ColumnEncoding", + "ColumnInMetadata", "CompressionType", "DictionaryPolicy", "JSONRecoveryMode", @@ -74,18 +80,30 @@ cdef class ColumnInMetadata: Metadata for a column """ + def __init__(self): + raise ValueError( + "ColumnInMetadata should not be constructed directly. " + "Use one of the factories." + ) + @staticmethod - cdef ColumnInMetadata from_metadata(column_in_metadata metadata): + cdef ColumnInMetadata from_libcudf( + column_in_metadata* metadata, TableInputMetadata owner + ): """ - Construct a ColumnInMetadata. + A Python representation of `column_in_metadata`. Parameters ---------- - metadata : column_in_metadata - """ - cdef ColumnInMetadata col_metadata = ColumnInMetadata.__new__(ColumnInMetadata) - col_metadata.c_obj = metadata - return col_metadata + metadata : column_in_metadata* + Raw pointer to C++ metadata. 
+ owner : TableInputMetadata + Owning table input metadata that manages lifetime of the raw pointer. + """ + cdef ColumnInMetadata out = ColumnInMetadata.__new__(ColumnInMetadata) + out.c_obj = metadata + out.owner = owner + return out cpdef ColumnInMetadata set_name(self, str name): """ @@ -100,7 +118,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_name(name.encode()) + dereference(self.c_obj).set_name(name.encode()) return self cpdef ColumnInMetadata set_nullability(self, bool nullable): @@ -116,7 +134,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_nullability(nullable) + dereference(self.c_obj).set_nullability(nullable) return self cpdef ColumnInMetadata set_list_column_as_map(self): @@ -128,7 +146,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_list_column_as_map() + dereference(self.c_obj).set_list_column_as_map() return self cpdef ColumnInMetadata set_int96_timestamps(self, bool req): @@ -145,7 +163,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_int96_timestamps(req) + dereference(self.c_obj).set_int96_timestamps(req) return self cpdef ColumnInMetadata set_decimal_precision(self, uint8_t precision): @@ -162,7 +180,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_decimal_precision(precision) + dereference(self.c_obj).set_decimal_precision(precision) return self cpdef ColumnInMetadata child(self, size_type i): @@ -178,7 +196,8 @@ cdef class ColumnInMetadata: ------- ColumnInMetadata """ - return ColumnInMetadata.from_metadata(self.c_obj.child(i)) + cdef column_in_metadata* child_c_obj = &dereference(self.c_obj).child(i) + return ColumnInMetadata.from_libcudf(child_c_obj, self.owner) cpdef ColumnInMetadata set_output_as_binary(self, bool binary): """ @@ -193,7 +212,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_output_as_binary(binary) + dereference(self.c_obj).set_output_as_binary(binary) return self cpdef ColumnInMetadata 
set_type_length(self, int32_t type_length): @@ -209,7 +228,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_type_length(type_length) + dereference(self.c_obj).set_type_length(type_length) return self cpdef ColumnInMetadata set_skip_compression(self, bool skip): @@ -226,7 +245,7 @@ cdef class ColumnInMetadata: ------- Self """ - self.c_obj.set_skip_compression(skip) + dereference(self.c_obj).set_skip_compression(skip) return self cpdef ColumnInMetadata set_encoding(self, column_encoding encoding): @@ -243,7 +262,7 @@ cdef class ColumnInMetadata: ------- ColumnInMetadata """ - self.c_obj.set_encoding(encoding) + dereference(self.c_obj).set_encoding(encoding) return self cpdef str get_name(self): @@ -255,7 +274,7 @@ cdef class ColumnInMetadata: str The name of this column """ - return self.c_obj.get_name().decode() + return dereference(self.c_obj).get_name().decode() cdef class TableInputMetadata: @@ -269,6 +288,10 @@ cdef class TableInputMetadata: """ def __init__(self, Table table): self.c_obj = table_input_metadata(table.view()) + self.column_metadata = [ + ColumnInMetadata.from_libcudf(&self.c_obj.column_metadata[i], self) + for i in range(self.c_obj.column_metadata.size()) + ] cdef class TableWithMetadata: diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py index 5ed660ba6cf..2557e40c935 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -1,4 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+ import pyarrow as pa import pytest from utils import _convert_types, assert_table_and_meta_eq, make_source @@ -52,3 +53,64 @@ def test_read_orc_basic( ) assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) + + +@pytest.mark.parametrize( + "compression", + [ + plc.io.types.CompressionType.NONE, + plc.io.types.CompressionType.SNAPPY, + ], +) +@pytest.mark.parametrize( + "statistics", + [ + plc.io.types.StatisticsFreq.STATISTICS_NONE, + plc.io.types.StatisticsFreq.STATISTICS_COLUMN, + ], +) +@pytest.mark.parametrize("stripe_size_bytes", [None, 65536]) +@pytest.mark.parametrize("stripe_size_rows", [None, 512]) +@pytest.mark.parametrize("row_index_stride", [None, 512]) +def test_roundtrip_pa_table( + compression, + statistics, + stripe_size_bytes, + stripe_size_rows, + row_index_stride, + tmp_path, +): + pa_table = pa.table({"a": [1.0, 2.0, None], "b": [True, None, False]}) + plc_table = plc.interop.from_arrow(pa_table) + + tmpfile_name = tmp_path / "test.orc" + + sink = plc.io.SinkInfo([str(tmpfile_name)]) + + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + user_data = {"a": "", "b": ""} + options = ( + plc.io.orc.OrcWriterOptions.builder(sink, plc_table) + .metadata(tbl_meta) + .key_value_metadata(user_data) + .compression(compression) + .enable_statistics(statistics) + .build() + ) + if stripe_size_bytes is not None: + options.set_stripe_size_bytes(stripe_size_bytes) + if stripe_size_rows is not None: + options.set_stripe_size_rows(stripe_size_rows) + if row_index_stride is not None: + options.set_row_index_stride(row_index_stride) + + plc.io.orc.write_orc(options) + + read_table = pa.orc.read_table(str(tmpfile_name)) + + res = plc.io.types.TableWithMetadata( + plc.interop.from_arrow(read_table), + [(name, []) for name in pa_table.schema.names], + ) + + assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_types.py b/python/pylibcudf/pylibcudf/tests/io/test_types.py new 
file mode 100644 index 00000000000..a7642556bf2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/io/test_types.py @@ -0,0 +1,28 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import gc +import weakref + +import pyarrow as pa + +import pylibcudf as plc + + +def test_gc_with_table_and_column_input_metadata(): + class Foo(plc.io.types.TableInputMetadata): + def __del__(self): + pass + + pa_table = pa.table( + {"a": pa.array([1, 2, 3]), "b": pa.array(["a", "b", "c"])} + ) + plc_table = plc.interop.from_arrow(pa_table) + + tbl_meta = Foo(plc_table) + weak_tbl_meta = weakref.ref(tbl_meta) + + del tbl_meta + + gc.collect() + + assert weak_tbl_meta() is None From 776ef54968073e808518b6be2b525de5c28f8070 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 26 Nov 2024 11:36:29 -0800 Subject: [PATCH 06/14] Remove cudf._lib.join in favor of inlining pylibcudf (#17371) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17371 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/join.pyx | 43 ----------------- python/cudf/cudf/core/column/column.py | 12 +++-- python/cudf/cudf/core/groupby/groupby.py | 19 ++++++-- python/cudf/cudf/core/index.py | 12 ++++- python/cudf/cudf/core/join/join.py | 60 ++++++++++++++++++------ python/cudf/cudf/core/multiindex.py | 19 ++++++-- 8 files changed, 95 insertions(+), 72 deletions(-) delete mode 100644 python/cudf/cudf/_lib/join.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 61d3bcbe24e..45e0fc345b5 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -22,7 +22,6 @@ set(cython_sources filling.pyx groupby.pyx interop.pyx - join.pyx 
json.pyx merge.pyx null_mask.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index efa437eebb7..c51db601985 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -9,7 +9,6 @@ filling, groupby, interop, - join, json, merge, null_mask, diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx deleted file mode 100644 index 2559358c21f..00000000000 --- a/python/cudf/cudf/_lib/join.pyx +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf - -# The functions below return the *gathermaps* that represent -# the join result when joining on the keys `lhs` and `rhs`. - - -@acquire_spill_lock() -def join(list lhs, list rhs, how=None): - if how == "outer": - how = "full" - if (join_func := getattr(pylibcudf.join, f"{how}_join", None)) is None: - raise ValueError(f"Invalid join type {how}") - - left_rows, right_rows = join_func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in lhs]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in rhs]), - pylibcudf.types.NullEquality.EQUAL - ) - return Column.from_pylibcudf(left_rows), Column.from_pylibcudf(right_rows) - - -@acquire_spill_lock() -def semi_join(list lhs, list rhs, how=None): - if ( - join_func := getattr( - pylibcudf.join, f"{how.replace('left', 'left_')}_join", None - ) - ) is None: - raise ValueError(f"Invalid join type {how}") - - return Column.from_pylibcudf( - join_func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in lhs]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in rhs]), - pylibcudf.types.NullEquality.EQUAL - ) - ), None diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 53946be1c49..f0df4a3c1b3 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1424,8 +1424,6 @@ 
def _label_encoding( ] dtype: int8 """ - from cudf._lib.join import join as cpp_join - if na_sentinel is None or na_sentinel.value is cudf.NA: na_sentinel = cudf.Scalar(-1) @@ -1447,15 +1445,21 @@ def _return_sentinel_column(): except ValueError: return _return_sentinel_column() - left_gather_map, right_gather_map = cpp_join( - [self], [cats], how="left" + left_rows, right_rows = plc.join.left_join( + plc.Table([self.to_pylibcudf(mode="read")]), + plc.Table([cats.to_pylibcudf(mode="read")]), + plc.types.NullEquality.EQUAL, ) + left_gather_map = type(self).from_pylibcudf(left_rows) + right_gather_map = type(self).from_pylibcudf(right_rows) + codes = libcudf.copying.gather( [as_column(range(len(cats)), dtype=dtype)], right_gather_map, nullify=True, ) del right_gather_map + del right_rows # reorder `codes` so that its values correspond to the # values of `self`: (codes,) = libcudf.sort.sort_by_key( diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b274bdea76d..315324c130c 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -772,9 +772,22 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): join_keys = map(list, zip(*join_keys)) # By construction, left and right keys are related by # a permutation, so we can use an inner join. - left_order, right_order = libcudf.join.join( - *join_keys, how="inner" - ) + with acquire_spill_lock(): + plc_tables = [ + plc.Table( + [col.to_pylibcudf(mode="read") for col in cols] + ) + for cols in join_keys + ] + left_plc, right_plc = plc.join.inner_join( + plc_tables[0], + plc_tables[1], + plc.types.NullEquality.EQUAL, + ) + left_order = libcudf.column.Column.from_pylibcudf(left_plc) + right_order = libcudf.column.Column.from_pylibcudf( + right_plc + ) # left order is some permutation of the ordering we # want, and right order is a matching gather map for # the result table. 
Get the correct order by sorting diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0a2b15a16b9..80e037c36fd 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -16,6 +16,8 @@ import pyarrow as pa from typing_extensions import Self +import pylibcudf as plc + import cudf from cudf import _lib as libcudf from cudf._lib.filling import sequence @@ -32,6 +34,7 @@ from cudf.core._base_index import BaseIndex, _return_get_indexer_result from cudf.core._compat import PANDAS_LT_300 from cudf.core._internals.search import search_sorted +from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( CategoricalColumn, ColumnBase, @@ -1360,7 +1363,14 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): except ValueError: return _return_get_indexer_result(result.values) - scatter_map, indices = libcudf.join.join([lcol], [rcol], how="inner") + with acquire_spill_lock(): + left_plc, right_plc = plc.join.inner_join( + plc.Table([lcol.to_pylibcudf(mode="read")]), + plc.Table([rcol.to_pylibcudf(mode="read")]), + plc.types.NullEquality.EQUAL, + ) + scatter_map = libcudf.column.Column.from_pylibcudf(left_plc) + indices = libcudf.column.Column.from_pylibcudf(right_plc) result = libcudf.copying.scatter([indices], scatter_map, [result])[0] result_series = cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index cfeaca00888..5c224176730 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -2,11 +2,14 @@ from __future__ import annotations import itertools -from typing import Any, ClassVar +from typing import Any + +import pylibcudf as plc import cudf from cudf import _lib as libcudf from cudf._lib.types import size_type_dtype +from cudf.core.buffer import acquire_spill_lock from cudf.core.copy_types import GatherMap from cudf.core.join._join_helpers import ( _coerce_to_tuple, @@ -17,19 
+20,26 @@ class Merge: - # The joiner function must have the following signature: - # - # def joiner( - # lhs: Frame, - # rhs: Frame - # ) -> Tuple[Optional[Column], Optional[Column]]: - # ... - # - # where `lhs` and `rhs` are Frames composed of the left and right - # join key. The `joiner` returns a tuple of two Columns - # representing the rows to gather from the left- and right- side - # tables respectively. - _joiner: ClassVar[staticmethod] = staticmethod(libcudf.join.join) + @staticmethod + @acquire_spill_lock() + def _joiner( + lhs: list[libcudf.column.Column], + rhs: list[libcudf.column.Column], + how: str, + ) -> tuple[libcudf.column.Column, libcudf.column.Column]: + if how == "outer": + how = "full" + if (join_func := getattr(plc.join, f"{how}_join", None)) is None: + raise ValueError(f"Invalid join type {how}") + + left_rows, right_rows = join_func( + plc.Table([col.to_pylibcudf(mode="read") for col in lhs]), + plc.Table([col.to_pylibcudf(mode="read") for col in rhs]), + plc.types.NullEquality.EQUAL, + ) + return libcudf.column.Column.from_pylibcudf( + left_rows + ), libcudf.column.Column.from_pylibcudf(right_rows) def __init__( self, @@ -546,7 +556,27 @@ def _validate_merge_params( class MergeSemi(Merge): - _joiner: ClassVar[staticmethod] = staticmethod(libcudf.join.semi_join) + @staticmethod + @acquire_spill_lock() + def _joiner( + lhs: list[libcudf.column.Column], + rhs: list[libcudf.column.Column], + how: str, + ) -> tuple[libcudf.column.Column, None]: + if ( + join_func := getattr( + plc.join, f"{how.replace('left', 'left_')}_join", None + ) + ) is None: + raise ValueError(f"Invalid join type {how}") + + return libcudf.column.Column.from_pylibcudf( + join_func( + plc.Table([col.to_pylibcudf(mode="read") for col in lhs]), + plc.Table([col.to_pylibcudf(mode="read") for col in rhs]), + plc.types.NullEquality.EQUAL, + ) + ), None def _merge_results(self, lhs: cudf.DataFrame, rhs: cudf.DataFrame): # semi-join result includes only lhs columns diff --git 
a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index bfff62f0a89..19a53af018d 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -14,6 +14,8 @@ import numpy as np import pandas as pd +import pylibcudf as plc + import cudf import cudf._lib as libcudf from cudf._lib.types import size_type_dtype @@ -22,6 +24,7 @@ from cudf.core import column from cudf.core._base_index import _return_get_indexer_result from cudf.core.algorithms import factorize +from cudf.core.buffer import acquire_spill_lock from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import ( @@ -1919,10 +1922,18 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): for lcol, rcol in zip(target._columns, self._columns) ] join_keys = map(list, zip(*join_keys)) - scatter_map, indices = libcudf.join.join( - *join_keys, - how="inner", - ) + with acquire_spill_lock(): + plc_tables = [ + plc.Table([col.to_pylibcudf(mode="read") for col in cols]) + for cols in join_keys + ] + left_plc, right_plc = plc.join.inner_join( + plc_tables[0], + plc_tables[1], + plc.types.NullEquality.EQUAL, + ) + scatter_map = libcudf.column.Column.from_pylibcudf(left_plc) + indices = libcudf.column.Column.from_pylibcudf(right_plc) result = libcudf.copying.scatter([indices], scatter_map, [result])[0] result_series = cudf.Series._from_column(result) From d7141739504521d7a7db6fcd87305e29392734b3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 26 Nov 2024 13:23:19 -0800 Subject: [PATCH 07/14] Remove unused type aliases (#17396) This PR eliminates unused type aliases in join details. 
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17396 --- cpp/src/join/join_common_utils.hpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 573101cefd9..3645b3333b3 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -50,11 +50,6 @@ using mixed_multimap_type = cudf::detail::cuco_allocator, cuco::legacy::double_hashing<1, hash_type, hash_type>>; -using row_hash_legacy = - cudf::row_hasher; - -using row_equality_legacy = cudf::row_equality_comparator; - bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type); } // namespace detail } // namespace cudf From fa62ff45eddd8256f0a3e8cebf077970dd70cb67 Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Tue, 26 Nov 2024 15:28:02 -0600 Subject: [PATCH 08/14] Add breaking change workflow trigger (#17248) Adds a workflow that triggers a second workflow which sends a notification to a designated Slack channel on every PR labelled with breaking, whenever any of the following events are triggered on the PR: - closed - reopened - labeled - unlabeled Depends on https://github.com/rapidsai/shared-workflows/pull/257 --- .../trigger-breaking-change-alert.yaml | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 .github/workflows/trigger-breaking-change-alert.yaml diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml new file mode 100644 index 00000000000..3b972f31ca4 --- /dev/null +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -0,0 +1,26 @@ +name: Trigger Breaking Change Notifications + +on: + pull_request_target: + types: + - closed + - 
reopened + - labeled + - unlabeled + +jobs: + trigger-notifier: + if: contains(github.event.pull_request.labels.*.name, 'breaking') + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-24.12 + with: + sender_login: ${{ github.event.sender.login }} + sender_avatar: ${{ github.event.sender.avatar_url }} + repo: ${{ github.repository }} + pr_number: ${{ github.event.pull_request.number }} + pr_title: "${{ github.event.pull_request.title }}" + pr_body: "${{ github.event.pull_request.body || '_Empty PR description_' }}" + pr_base_ref: ${{ github.event.pull_request.base.ref }} + pr_author: ${{ github.event.pull_request.user.login }} + event_action: ${{ github.event.action }} + pr_merged: ${{ github.event.pull_request.merged }} From 797a07b00a048396b1ee1d8bc6d94ecc0ea43ea7 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 26 Nov 2024 16:30:18 -0600 Subject: [PATCH 09/14] Require approval to run CI on draft PRs (#17450) By default, CI runs on draft PRs. This leads to many CI runs that may be unnecessary. With this PR's change to `.github/copy-pr-bot.yaml`, an `/ok to test` comment from a trusted user is required to trigger CI on draft PRs. Non-draft PRs will run CI by default, assuming that all commits are signed by trusted users. Otherwise an `/ok to test` is required (as before) -- see the `copy-pr-bot` docs at https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ for more information. Part of https://github.com/rapidsai/build-planning/issues/123. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/17450 --- .github/copy-pr-bot.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml index 895ba83ee54..e0ea775aad5 100644 --- a/.github/copy-pr-bot.yaml +++ b/.github/copy-pr-bot.yaml @@ -2,3 +2,4 @@ # https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ enabled: true +auto_sync_draft: false From 45330858e09bef97146a0673106bef93b9c9602c Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 26 Nov 2024 19:26:43 -0500 Subject: [PATCH 10/14] Add CSV Reader options classes to pylibcudf (#17412) This PR adds the CSV reader options classes to pylibcudf and plumbs the changes through cudf python. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17412 --- python/cudf/cudf/_lib/csv.pyx | 99 +- python/cudf_polars/cudf_polars/dsl/ir.py | 37 +- python/pylibcudf/pylibcudf/io/csv.pxd | 56 +- python/pylibcudf/pylibcudf/io/csv.pyi | 63 +- python/pylibcudf/pylibcudf/io/csv.pyx | 842 +++++++++++++----- .../pylibcudf/pylibcudf/tests/io/test_csv.py | 88 +- 6 files changed, 853 insertions(+), 332 deletions(-) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 59a970263e0..641fc18c203 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -202,46 +202,71 @@ def read_csv( raise ValueError( "dtype should be a scalar/str/list-like/dict-like" ) + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([datasource])) + .compression(c_compression) + .mangle_dupe_cols(mangle_dupe_cols) + .byte_range_offset(byte_range[0]) + .byte_range_size(byte_range[1]) + .nrows(nrows if nrows is not None else -1) + .skiprows(skiprows) + .skipfooter(skipfooter) + .quoting(quoting) 
+ .lineterminator(str(lineterminator)) + .quotechar(quotechar) + .decimal(decimal) + .delim_whitespace(delim_whitespace) + .skipinitialspace(skipinitialspace) + .skip_blank_lines(skip_blank_lines) + .doublequote(doublequote) + .keep_default_na(keep_default_na) + .na_filter(na_filter) + .dayfirst(dayfirst) + .build() + ) + + options.set_header(header) + + if names is not None: + options.set_names([str(name) for name in names]) + + if prefix is not None: + options.set_prefix(prefix) + + if usecols is not None: + if all(isinstance(col, int) for col in usecols): + options.set_use_cols_indexes(list(usecols)) + else: + options.set_use_cols_names([str(name) for name in usecols]) + + if delimiter is not None: + options.set_delimiter(delimiter) + + if thousands is not None: + options.set_thousands(thousands) - lineterminator = str(lineterminator) + if comment is not None: + options.set_comment(comment) + + if parse_dates is not None: + options.set_parse_dates(list(parse_dates)) + + if hex_cols is not None: + options.set_parse_hex(list(hex_cols)) + + options.set_dtypes(new_dtypes) + + if true_values is not None: + options.set_true_values([str(val) for val in true_values]) + + if false_values is not None: + options.set_false_values([str(val) for val in false_values]) + + if na_values is not None: + options.set_na_values([str(val) for val in na_values]) df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io( - plc.io.csv.read_csv( - plc.io.SourceInfo([datasource]), - lineterminator=lineterminator, - quotechar = quotechar, - quoting = quoting, - doublequote = doublequote, - header = header, - mangle_dupe_cols = mangle_dupe_cols, - usecols = usecols, - delimiter = delimiter, - delim_whitespace = delim_whitespace, - skipinitialspace = skipinitialspace, - col_names = names, - dtypes = new_dtypes, - skipfooter = skipfooter, - skiprows = skiprows, - dayfirst = dayfirst, - compression = c_compression, - thousands = thousands, - decimal = decimal, - true_values = true_values, - 
false_values = false_values, - nrows = nrows if nrows is not None else -1, - byte_range_offset = byte_range[0], - byte_range_size = byte_range[1], - skip_blank_lines = skip_blank_lines, - parse_dates = parse_dates, - parse_hex = hex_cols, - comment = comment, - na_values = na_values, - keep_default_na = keep_default_na, - na_filter = na_filter, - prefix = prefix, - ) - ) + *data_from_pylibcudf_io(plc.io.csv.read_csv(options)) ) if dtype is not None: diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 6617b71be81..e8d9691f2a0 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -476,23 +476,28 @@ def do_evaluate( with path.open() as f: while f.readline() == "\n": skiprows += 1 - tbl_w_meta = plc.io.csv.read_csv( - plc.io.SourceInfo([path]), - delimiter=sep, - quotechar=quote, - lineterminator=eol, - col_names=column_names, - header=header, - usecols=usecols, - na_filter=True, - na_values=null_values, - keep_default_na=False, - skiprows=skiprows, - comment=comment, - decimal=decimal, - dtypes=schema, - nrows=n_rows, + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([path])) + .nrows(n_rows) + .skiprows(skiprows) + .lineterminator(str(eol)) + .quotechar(str(quote)) + .decimal(decimal) + .keep_default_na(keep_default_na=False) + .na_filter(na_filter=True) + .build() ) + options.set_delimiter(str(sep)) + if column_names is not None: + options.set_names([str(name) for name in column_names]) + options.set_header(header) + options.set_dtypes(schema) + if usecols is not None: + options.set_use_cols_names([str(name) for name in usecols]) + options.set_na_values(null_values) + if comment is not None: + options.set_comment(comment) + tbl_w_meta = plc.io.csv.read_csv(options) pieces.append(tbl_w_meta) if read_partial: n_rows -= tbl_w_meta.tbl.num_rows() diff --git a/python/pylibcudf/pylibcudf/io/csv.pxd b/python/pylibcudf/pylibcudf/io/csv.pxd index 
f04edaa316a..95f3ff4fe45 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pxd +++ b/python/pylibcudf/pylibcudf/io/csv.pxd @@ -6,11 +6,63 @@ from libcpp cimport bool from pylibcudf.libcudf.io.csv cimport ( csv_writer_options, csv_writer_options_builder, + csv_reader_options, + csv_reader_options_builder, ) -from pylibcudf.libcudf.io.types cimport quote_style -from pylibcudf.io.types cimport SinkInfo +from pylibcudf.io.types cimport SinkInfo, SourceInfo, TableWithMetadata from pylibcudf.table cimport Table +from pylibcudf.libcudf.io.types cimport ( + compression_type, + quote_style, + table_with_metadata, +) +from pylibcudf.libcudf.types cimport size_type + +cdef class CsvReaderOptions: + cdef csv_reader_options c_obj + cdef SourceInfo source + cpdef void set_header(self, size_type header) + cpdef void set_names(self, list col_names) + cpdef void set_prefix(self, str prefix) + cpdef void set_use_cols_indexes(self, list col_indices) + cpdef void set_use_cols_names(self, list col_names) + cpdef void set_delimiter(self, str delimiter) + cpdef void set_thousands(self, str thousands) + cpdef void set_comment(self, str comment) + cpdef void set_parse_dates(self, list val) + cpdef void set_parse_hex(self, list val) + cpdef void set_dtypes(self, object types) + cpdef void set_true_values(self, list true_values) + cpdef void set_false_values(self, list false_values) + cpdef void set_na_values(self, list na_values) + + +cdef class CsvReaderOptionsBuilder: + cdef csv_reader_options_builder c_obj + cdef SourceInfo source + cpdef CsvReaderOptionsBuilder compression(self, compression_type compression) + cpdef CsvReaderOptionsBuilder mangle_dupe_cols(self, bool mangle_dupe_cols) + cpdef CsvReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset) + cpdef CsvReaderOptionsBuilder byte_range_size(self, size_t byte_range_size) + cpdef CsvReaderOptionsBuilder nrows(self, size_type nrows) + cpdef CsvReaderOptionsBuilder skiprows(self, size_type skiprows) + cpdef 
CsvReaderOptionsBuilder skipfooter(self, size_type skipfooter) + cpdef CsvReaderOptionsBuilder quoting(self, quote_style quoting) + cpdef CsvReaderOptionsBuilder lineterminator(self, str lineterminator) + cpdef CsvReaderOptionsBuilder quotechar(self, str quotechar) + cpdef CsvReaderOptionsBuilder decimal(self, str decimal) + cpdef CsvReaderOptionsBuilder delim_whitespace(self, bool delim_whitespace) + cpdef CsvReaderOptionsBuilder skipinitialspace(self, bool skipinitialspace) + cpdef CsvReaderOptionsBuilder skip_blank_lines(self, bool skip_blank_lines) + cpdef CsvReaderOptionsBuilder doublequote(self, bool doublequote) + cpdef CsvReaderOptionsBuilder keep_default_na(self, bool keep_default_na) + cpdef CsvReaderOptionsBuilder na_filter(self, bool na_filter) + cpdef CsvReaderOptionsBuilder dayfirst(self, bool dayfirst) + cpdef CsvReaderOptions build(self) + +cpdef TableWithMetadata read_csv(CsvReaderOptions options) + cdef class CsvWriterOptions: cdef csv_writer_options c_obj cdef Table table diff --git a/python/pylibcudf/pylibcudf/io/csv.pyi b/python/pylibcudf/pylibcudf/io/csv.pyi index 583b66bc29c..540cbc778ea 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyi +++ b/python/pylibcudf/pylibcudf/io/csv.pyi @@ -2,6 +2,8 @@ from collections.abc import Mapping +from typing_extensions import Self + from pylibcudf.io.types import ( CompressionType, QuoteStyle, @@ -12,6 +14,47 @@ from pylibcudf.io.types import ( from pylibcudf.table import Table from pylibcudf.types import DataType +class CsvReaderOptions: + def __init__(self): ... + def set_header(self, header: int): ... + def set_names(self, col_names: list[str]): ... + def set_prefix(self, prefix: str): ... + def set_use_cols_indexes(self, col_indices: list[int]): ... + def set_use_cols_names(self, col_names: list[str]): ... + def set_delimiter(self, delimiter: str): ... + def set_thousands(self, thousands: str): ... + def set_comment(self, comment: str): ... + def set_parse_dates(self, val: list[int | str]): ... 
+ def set_parse_hex(self, val: list[int | str]): ... + def set_dtypes(self, types: dict[str, DataType] | list[DataType]): ... + def set_true_values(self, true_values: list[str]): ... + def set_false_values(self, false_values: list[str]): ... + def set_na_values(self, na_values: list[str]): ... + @staticmethod + def builder(source: SourceInfo) -> CsvReaderOptionsBuilder: ... + +class CsvReaderOptionsBuilder: + def __init__(self): ... + def compression(self, compression: CompressionType) -> Self: ... + def mangle_dupe_cols(self, mangle_dupe_cols: bool) -> Self: ... + def byte_range_offset(self, byte_range_offset: int) -> Self: ... + def byte_range_size(self, byte_range_size: int) -> Self: ... + def nrows(self, nrows: int) -> Self: ... + def skiprows(self, skiprows: int) -> Self: ... + def skipfooter(self, skipfooter: int) -> Self: ... + def quoting(self, quoting: QuoteStyle) -> Self: ... + def lineterminator(self, lineterminator: str) -> Self: ... + def quotechar(self, quotechar: str) -> Self: ... + def decimal(self, decimal: str) -> Self: ... + def delim_whitespace(self, delim_whitespace: bool) -> Self: ... + def skipinitialspace(self, skipinitialspace: bool) -> Self: ... + def skip_blank_lines(self, skip_blank_lines: bool) -> Self: ... + def doublequote(self, doublequote: bool) -> Self: ... + def keep_default_na(self, keep_default_na: bool) -> Self: ... + def na_filter(self, na_filter: bool) -> Self: ... + def dayfirst(self, dayfirst: bool) -> Self: ... + def build(self) -> CsvReaderOptions: ... + def read_csv( source_info: SourceInfo, *, @@ -54,7 +97,7 @@ def read_csv( # detect_whitespace_around_quotes: bool = False, # timestamp_type: DataType = DataType(type_id.EMPTY), ) -> TableWithMetadata: ... -def write_csv(options: CsvWriterOptionsBuilder) -> None: ... +def write_csv(options: CsvWriterOptionsBuilder): ... class CsvWriterOptions: def __init__(self): ... @@ -63,14 +106,12 @@ class CsvWriterOptions: class CsvWriterOptionsBuilder: def __init__(self): ... 
- def names(self, names: list) -> CsvWriterOptionsBuilder: ... - def na_rep(self, val: str) -> CsvWriterOptionsBuilder: ... - def include_header(self, val: bool) -> CsvWriterOptionsBuilder: ... - def rows_per_chunk(self, val: int) -> CsvWriterOptionsBuilder: ... - def line_terminator(self, term: str) -> CsvWriterOptionsBuilder: ... - def inter_column_delimiter( - self, delim: str - ) -> CsvWriterOptionsBuilder: ... - def true_value(self, val: str) -> CsvWriterOptionsBuilder: ... - def false_value(self, val: str) -> CsvWriterOptionsBuilder: ... + def names(self, names: list) -> Self: ... + def na_rep(self, val: str) -> Self: ... + def include_header(self, val: bool) -> Self: ... + def rows_per_chunk(self, val: int) -> Self: ... + def line_terminator(self, term: str) -> Self: ... + def inter_column_delimiter(self, delim: str) -> Self: ... + def true_value(self, val: str) -> Self: ... + def false_value(self, val: str) -> Self: ... def build(self) -> CsvWriterOptions: ... diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index 8be391de2c2..efc9bb813a1 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -28,252 +28,628 @@ __all__ = [ "write_csv", "CsvWriterOptions", "CsvWriterOptionsBuilder", + "CsvReaderOptions", + "CsvReaderOptionsBuilder", ] -cdef tuple _process_parse_dates_hex(list cols): - cdef vector[string] str_cols - cdef vector[int] int_cols - for col in cols: - if isinstance(col, str): - str_cols.push_back(col.encode()) +cdef class CsvReaderOptions: + """The settings to use for ``read_csv`` + For details, see :cpp:class:`cudf::io::csv_reader_options` + """ + @staticmethod + def builder(SourceInfo source): + """ + Create a CsvWriterOptionsBuilder object + + For details, see :cpp:func:`cudf::io::csv_reader_options::builder` + + Parameters + ---------- + sink : SourceInfo + The source to read the CSV file from. 
+ + Returns + ------- + CsvReaderOptionsBuilder + Builder to build CsvReaderOptions + """ + cdef CsvReaderOptionsBuilder csv_builder = CsvReaderOptionsBuilder.__new__( + CsvReaderOptionsBuilder + ) + csv_builder.c_obj = csv_reader_options.builder(source.c_obj) + csv_builder.source = source + return csv_builder + + cpdef void set_header(self, size_type header): + """ + Sets header row index. + + Parameters + ---------- + header : size_type + Index where header row is located + + Returns + ------- + None + """ + self.c_obj.set_header(header) + + cpdef void set_names(self, list col_names): + """ + Sets names of the column. + + Parameters + ---------- + col_names : list[str] + List of column names + + Returns + ------- + None + """ + cdef vector[string] vec + for name in col_names: + vec.push_back(name.encode()) + self.c_obj.set_names(vec) + + cpdef void set_prefix(self, str prefix): + """ + Sets prefix to be used for column ID. + + Parameters + ---------- + prefix : str + String used as prefix in for each column name + + Returns + ------- + None + """ + self.c_obj.set_prefix(prefix.encode()) + + cpdef void set_use_cols_indexes(self, list col_indices): + """ + Sets indexes of columns to read. + + Parameters + ---------- + col_indices : list[int] + List of column indices that are needed + + Returns + ------- + None + """ + cdef vector[int] vec + for i in col_indices: + vec.push_back(i) + self.c_obj.set_use_cols_indexes(vec) + + cpdef void set_use_cols_names(self, list col_names): + """ + Sets names of the columns to be read. + + Parameters + ---------- + col_names : list[str] + List of column indices that are needed + + Returns + ------- + None + """ + cdef vector[string] vec + for name in col_names: + vec.push_back(name.encode()) + self.c_obj.set_use_cols_names(vec) + + cpdef void set_delimiter(self, str delimiter): + """ + Sets field delimiter. 
+ + Parameters + ---------- + delimiter : str + A character to indicate delimiter + + Returns + ------- + None + """ + self.c_obj.set_delimiter(ord(delimiter)) + + cpdef void set_thousands(self, str thousands): + """ + Sets numeric data thousands separator. + + Parameters + ---------- + thousands : str + A character that separates thousands + + Returns + ------- + None + """ + self.c_obj.set_thousands(ord(thousands)) + + cpdef void set_comment(self, str comment): + """ + Sets comment line start character. + + Parameters + ---------- + comment : str + A character that indicates comment + + Returns + ------- + None + """ + self.c_obj.set_comment(ord(comment)) + + cpdef void set_parse_dates(self, list val): + """ + Sets indexes or names of columns to read as datetime. + + Parameters + ---------- + val : list[int | str] + List column indices or names to infer as datetime. + + Returns + ------- + None + """ + cdef vector[string] vec_str + cdef vector[int] vec_int + if not all([isinstance(col, (str, int)) for col in val]): + raise TypeError("Must be a list of int or str") + else: + for date in val: + if isinstance(date, str): + vec_str.push_back(date.encode()) + else: + vec_int.push_back(date) + self.c_obj.set_parse_dates(vec_str) + self.c_obj.set_parse_dates(vec_int) + + cpdef void set_parse_hex(self, list val): + """ + Sets indexes or names of columns to parse as hexadecimal. 
+ + Parameters + ---------- + val : list[int | str] + List of column indices or names to parse as hexadecimal + + Returns + ------- + None + """ + cdef vector[string] vec_str + cdef vector[int] vec_int + if not all([isinstance(col, (str, int)) for col in val]): + raise TypeError("Must be a list of int or str") else: - int_cols.push_back(col) - return str_cols, int_cols - -cdef vector[string] _make_str_vector(list vals): - cdef vector[string] res - for val in vals: - res.push_back((val).encode()) - return res - - -def read_csv( - SourceInfo source_info, - *, - compression_type compression = compression_type.AUTO, - size_t byte_range_offset = 0, - size_t byte_range_size = 0, - list col_names = None, - str prefix = "", - bool mangle_dupe_cols = True, - list usecols = None, - size_type nrows = -1, - size_type skiprows = 0, - size_type skipfooter = 0, - size_type header = 0, - str lineterminator = "\n", - str delimiter = None, - str thousands = None, - str decimal = ".", - str comment = None, - bool delim_whitespace = False, - bool skipinitialspace = False, - bool skip_blank_lines = True, - quote_style quoting = quote_style.MINIMAL, - str quotechar = '"', - bool doublequote = True, - list parse_dates = None, - list parse_hex = None, - # Technically this should be dict/list - # but using a fused type prevents using None as default - object dtypes = None, - list true_values = None, - list false_values = None, - list na_values = None, - bool keep_default_na = True, - bool na_filter = True, - bool dayfirst = False, - # Note: These options are supported by the libcudf reader - # but are not exposed here since there is no demand for them - # on the Python side yet. 
- # bool detect_whitespace_around_quotes = False, - # DataType timestamp_type = DataType(type_id.EMPTY), + for hx in val: + if isinstance(hx, str): + vec_str.push_back(hx.encode()) + else: + vec_int.push_back(hx) + + self.c_obj.set_parse_hex(vec_str) + self.c_obj.set_parse_hex(vec_int) + + cpdef void set_dtypes(self, object types): + """ + Sets per-column types. + + Parameters + ---------- + types : dict[str, data_type] | list[data_type] + Column name to data type map specifying the columns' target data types. + Or a list specifying the columns' target data types. + + Returns + ------- + None + """ + cdef map[string, data_type] dtype_map + cdef vector[data_type] dtype_list + if isinstance(types, dict): + for name, dtype in types.items(): + dtype_map[str(name).encode()] = (dtype).c_obj + self.c_obj.set_dtypes(dtype_map) + elif isinstance(types, list): + for dtype in types: + dtype_list.push_back((dtype).c_obj) + self.c_obj.set_dtypes(dtype_list) + else: + raise TypeError("Must pass an dict or list") + + cpdef void set_true_values(self, list true_values): + """ + Sets additional values to recognize as boolean true values. + + Parameters + ---------- + true_values : list[str] + List of values to be considered to be true + + Returns + ------- + None + """ + cdef vector[string] vec + for val in true_values: + vec.push_back(val.encode()) + self.c_obj.set_true_values(vec) + + cpdef void set_false_values(self, list false_values): + """ + Sets additional values to recognize as boolean false values. + + Parameters + ---------- + false_values : list[str] + List of values to be considered to be false + + Returns + ------- + None + """ + cdef vector[string] vec + for val in false_values: + vec.push_back(val.encode()) + self.c_obj.set_false_values(vec) + + cpdef void set_na_values(self, list na_values): + """ + Sets additional values to recognize as null values. 
+ + Parameters + ---------- + na_values : list[str] + List of values to be considered to be null + + Returns + ------- + None + """ + cdef vector[string] vec + for val in na_values: + vec.push_back(val.encode()) + self.c_obj.set_na_values(vec) + + +cdef class CsvReaderOptionsBuilder: + """ + Builder to build options for ``read_csv`` + + For details, see :cpp:class:`cudf::io::csv_reader_options_builder` + """ + cpdef CsvReaderOptionsBuilder compression(self, compression_type compression): + """ + Sets compression format of the source. + + Parameters + ---------- + compression : compression_type + Compression type + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.compression(compression) + return self + + cpdef CsvReaderOptionsBuilder mangle_dupe_cols(self, bool mangle_dupe_cols): + """ + Sets whether to rename duplicate column names. + + Parameters + ---------- + mangle_dupe_cols : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.mangle_dupe_cols(mangle_dupe_cols) + return self + + cpdef CsvReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset): + """ + Sets number of bytes to skip from source start. + + Parameters + ---------- + byte_range_offset : size_t + Number of bytes of offset + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.byte_range_offset(byte_range_offset) + return self + + cpdef CsvReaderOptionsBuilder byte_range_size(self, size_t byte_range_size): + """ + Sets number of bytes to read. + + Parameters + ---------- + byte_range_offset : size_t + Number of bytes to read + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.byte_range_size(byte_range_size) + return self + + cpdef CsvReaderOptionsBuilder nrows(self, size_type nrows): + """ + Sets number of rows to read. 
+ + Parameters + ---------- + nrows : size_type + Number of rows to read + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.nrows(nrows) + return self + + cpdef CsvReaderOptionsBuilder skiprows(self, size_type skiprows): + """ + Sets number of rows to skip from start. + + Parameters + ---------- + skiprows : size_type + Number of rows to skip + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.skiprows(skiprows) + return self + + cpdef CsvReaderOptionsBuilder skipfooter(self, size_type skipfooter): + """ + Sets number of rows to skip from end. + + Parameters + ---------- + skipfooter : size_type + Number of rows to skip + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.skipfooter(skipfooter) + return self + + cpdef CsvReaderOptionsBuilder quoting(self, quote_style quoting): + """ + Sets quoting style. + + Parameters + ---------- + quoting : quote_style + Quoting style used + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.quoting(quoting) + return self + + cpdef CsvReaderOptionsBuilder lineterminator(self, str lineterminator): + """ + Sets line terminator. + + Parameters + ---------- + quoting : str + A character to indicate line termination + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.lineterminator(ord(lineterminator)) + return self + + cpdef CsvReaderOptionsBuilder quotechar(self, str quotechar): + """ + Sets quoting character. + + Parameters + ---------- + quotechar : str + A character to indicate quoting + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.quotechar(ord(quotechar)) + return self + + cpdef CsvReaderOptionsBuilder decimal(self, str decimal): + """ + Sets decimal point character. 
+ + Parameters + ---------- + quotechar : str + A character that indicates decimal values + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.decimal(ord(decimal)) + return self + + cpdef CsvReaderOptionsBuilder delim_whitespace(self, bool delim_whitespace): + """ + Sets whether to treat whitespace as field delimiter. + + Parameters + ---------- + delim_whitespace : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.delim_whitespace(delim_whitespace) + return self + + cpdef CsvReaderOptionsBuilder skipinitialspace(self, bool skipinitialspace): + """ + Sets whether to skip whitespace after the delimiter. + + Parameters + ---------- + skipinitialspace : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.skipinitialspace(skipinitialspace) + return self + + cpdef CsvReaderOptionsBuilder skip_blank_lines(self, bool skip_blank_lines): + """ + Sets whether to ignore empty lines or parse line values as invalid. + + Parameters + ---------- + skip_blank_lines : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.skip_blank_lines(skip_blank_lines) + return self + + cpdef CsvReaderOptionsBuilder doublequote(self, bool doublequote): + """ + Sets a quote inside a value is double-quoted. + + Parameters + ---------- + doublequote : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.doublequote(doublequote) + return self + + cpdef CsvReaderOptionsBuilder keep_default_na(self, bool keep_default_na): + """ + Sets whether to keep the built-in default NA values. 
+ + Parameters + ---------- + keep_default_na : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.keep_default_na(keep_default_na) + return self + + cpdef CsvReaderOptionsBuilder na_filter(self, bool na_filter): + """ + Sets whether to disable null filter. + + Parameters + ---------- + na_filter : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.na_filter(na_filter) + return self + + cpdef CsvReaderOptionsBuilder dayfirst(self, bool dayfirst): + """ + Sets whether to parse dates as DD/MM versus MM/DD. + + Parameters + ---------- + dayfirst : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.dayfirst(dayfirst) + return self + + cpdef CsvReaderOptions build(self): + """Create a CsvReaderOptions object""" + cdef CsvReaderOptions csv_options = CsvReaderOptions.__new__( + CsvReaderOptions + ) + csv_options.c_obj = move(self.c_obj.build()) + csv_options.source = self.source + return csv_options + + +cpdef TableWithMetadata read_csv( + CsvReaderOptions options ): - """Reads a CSV file into a :py:class:`~.types.TableWithMetadata`. + """ + Read from CSV format. + + The source to read from and options are encapsulated + by the `options` object. For details, see :cpp:func:`read_csv`. Parameters ---------- - source_info : SourceInfo - The SourceInfo to read the CSV file from. - compression : compression_type, default CompressionType.AUTO - The compression format of the CSV source. - byte_range_offset : size_type, default 0 - Number of bytes to skip from source start. - byte_range_size : size_type, default 0 - Number of bytes to read. By default, will read all bytes. - col_names : list, default None - The column names to use. - prefix : string, default '' - The prefix to apply to the column names. - mangle_dupe_cols : bool, default True - If True, rename duplicate column names. 
- usecols : list, default None - Specify the string column names/integer column indices of columns to be read. - nrows : size_type, default -1 - The number of rows to read. - skiprows : size_type, default 0 - The number of rows to skip from the start before reading - skipfooter : size_type, default 0 - The number of rows to skip from the end - header : size_type, default 0 - The index of the row that will be used for header names. - Pass -1 to use default column names. - lineterminator : str, default '\\n' - The character used to determine the end of a line. - delimiter : str, default "," - The character used to separate fields in a row. - thousands : str, default None - The character used as the thousands separator. - Cannot match delimiter. - decimal : str, default '.' - The character used as the decimal separator. - Cannot match delimiter. - comment : str, default None - The character used to identify the start of a comment line. - (which will be skipped by the reader) - delim_whitespace : bool, default False - If True, treat whitespace as the field delimiter. - skipinitialspace : bool, default False - If True, skip whitespace after the delimiter. - skip_blank_lines : bool, default True - If True, ignore empty lines (otherwise line values are parsed as null). - quoting : QuoteStyle, default QuoteStyle.MINIMAL - The quoting style used in the input CSV data. One of - { QuoteStyle.MINIMAL, QuoteStyle.ALL, QuoteStyle.NONNUMERIC, QuoteStyle.NONE } - quotechar : str, default '"' - The character used to indicate quoting. - doublequote : bool, default True - If True, a quote inside a value is double-quoted. - parse_dates : list, default None - A list of integer column indices/string column names - of columns to read as datetime. - parse_hex : list, default None - A list of integer column indices/string column names - of columns to read as hexadecimal. 
- dtypes : Union[Dict[str, DataType], List[DataType]], default None - A list of data types or a dictionary mapping column names - to a DataType. - true_values : List[str], default None - A list of additional values to recognize as True. - false_values : List[str], default None - A list of additional values to recognize as False. - na_values : List[str], default None - A list of additional values to recognize as null. - keep_default_na : bool, default True - Whether to keep the built-in default N/A values. - na_filter : bool, default True - Whether to detect missing values. If False, can - improve performance. - dayfirst : bool, default False - If True, interpret dates as being in the DD/MM format. - - Returns - ------- - TableWithMetadata - The Table and its corresponding metadata (column names) that were read in. + options: CsvReaderOptions + Settings for controlling reading behavior """ - cdef vector[string] c_parse_dates_names - cdef vector[int] c_parse_dates_indexes - cdef vector[int] c_parse_hex_names - cdef vector[int] c_parse_hex_indexes - cdef vector[data_type] c_dtypes_list - cdef map[string, data_type] c_dtypes_map - - cdef csv_reader_options options = ( - csv_reader_options.builder(source_info.c_obj) - .compression(compression) - .mangle_dupe_cols(mangle_dupe_cols) - .byte_range_offset(byte_range_offset) - .byte_range_size(byte_range_size) - .nrows(nrows) - .skiprows(skiprows) - .skipfooter(skipfooter) - .quoting(quoting) - .lineterminator(ord(lineterminator)) - .quotechar(ord(quotechar)) - .decimal(ord(decimal)) - .delim_whitespace(delim_whitespace) - .skipinitialspace(skipinitialspace) - .skip_blank_lines(skip_blank_lines) - .doublequote(doublequote) - .keep_default_na(keep_default_na) - .na_filter(na_filter) - .dayfirst(dayfirst) - .build() - ) - - options.set_header(header) - - if col_names is not None: - options.set_names([str(name).encode() for name in col_names]) - - if prefix is not None: - options.set_prefix(prefix.encode()) - - if usecols is 
not None: - if all([isinstance(col, int) for col in usecols]): - options.set_use_cols_indexes(list(usecols)) - else: - options.set_use_cols_names([str(name).encode() for name in usecols]) - - if delimiter is not None: - options.set_delimiter(ord(delimiter)) - - if thousands is not None: - options.set_thousands(ord(thousands)) - - if comment is not None: - options.set_comment(ord(comment)) - - if parse_dates is not None: - if not all([isinstance(col, (str, int)) for col in parse_dates]): - raise NotImplementedError( - "`parse_dates`: Must pass a list of column names/indices") - - # Set both since users are allowed to mix column names and indices - c_parse_dates_names, c_parse_dates_indexes = \ - _process_parse_dates_hex(parse_dates) - options.set_parse_dates(c_parse_dates_names) - options.set_parse_dates(c_parse_dates_indexes) - - if parse_hex is not None: - if not all([isinstance(col, (str, int)) for col in parse_hex]): - raise NotImplementedError( - "`parse_hex`: Must pass a list of column names/indices") - - # Set both since users are allowed to mix column names and indices - c_parse_hex_names, c_parse_hex_indexes = _process_parse_dates_hex(parse_hex) - options.set_parse_hex(c_parse_hex_names) - options.set_parse_hex(c_parse_hex_indexes) - - if isinstance(dtypes, list): - for dtype in dtypes: - c_dtypes_list.push_back((dtype).c_obj) - options.set_dtypes(c_dtypes_list) - elif isinstance(dtypes, dict): - # dtypes_t is dict - for k, v in dtypes.items(): - c_dtypes_map[str(k).encode()] = (v).c_obj - options.set_dtypes(c_dtypes_map) - elif dtypes is not None: - raise TypeError("dtypes must either by a list/dict") - - if true_values is not None: - options.set_true_values(_make_str_vector(true_values)) - - if false_values is not None: - options.set_false_values(_make_str_vector(false_values)) - - if na_values is not None: - options.set_na_values(_make_str_vector(na_values)) - cdef table_with_metadata c_result with nogil: - c_result = move(cpp_read_csv(options)) + 
c_result = move(cpp_read_csv(options.c_obj)) - return TableWithMetadata.from_libcudf(c_result) + cdef TableWithMetadata tbl_meta = TableWithMetadata.from_libcudf(c_result) + return tbl_meta # TODO: Implement the remaining methods diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index 90d2d0896a5..1cbaac57315 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -77,14 +77,16 @@ def test_read_csv_basic( offset=skiprows, length=nrows if nrows != -1 else None ) - res = plc.io.csv.read_csv( - plc.io.SourceInfo([source]), - delimiter=delimiter, - compression=compression_type, - col_names=column_names, - nrows=nrows, - skiprows=skiprows, + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([source])) + .compression(compression_type) + .nrows(nrows) + .skiprows(skiprows) + .build() ) + options.set_delimiter(delimiter) + options.set_names([str(name) for name in column_names]) + res = plc.io.csv.read_csv(options) assert_table_and_meta_eq( pa_table, @@ -110,15 +112,15 @@ def test_read_csv_byte_range(table_data, chunk_size, tmp_path): file_size = os.stat(source).st_size tbls_w_meta = [] for segment in range((file_size + chunk_size - 1) // chunk_size): - tbls_w_meta.append( - plc.io.csv.read_csv( - plc.io.SourceInfo([source]), - byte_range_offset=segment * chunk_size, - byte_range_size=chunk_size, - header=-1, - col_names=pa_table.column_names, - ) + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([source])) + .byte_range_offset(segment * chunk_size) + .byte_range_size(chunk_size) + .build() ) + options.set_header(-1) + options.set_names([str(name) for name in pa_table.column_names]) + tbls_w_meta.append(plc.io.csv.read_csv(options)) if isinstance(source, io.IOBase): source.seek(0) exp = pd.read_csv(source, names=pa_table.column_names, header=None) @@ -161,9 +163,16 @@ def test_read_csv_dtypes(csv_table_data, 
source_or_sink, usecols): new_schema = pa.schema(new_fields) - res = plc.io.csv.read_csv( - plc.io.SourceInfo([source]), dtypes=dtypes, usecols=usecols - ) + options = plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([source]) + ).build() + options.set_dtypes(dtypes) + if usecols is not None: + if all(isinstance(col, int) for col in usecols): + options.set_use_cols_indexes(list(usecols)) + else: + options.set_use_cols_names([str(name) for name in usecols]) + res = plc.io.csv.read_csv(options) new_table = pa_table.cast(new_schema) assert_table_and_meta_eq(new_table, res) @@ -171,7 +180,7 @@ def test_read_csv_dtypes(csv_table_data, source_or_sink, usecols): @pytest.mark.parametrize("skip_blanks", [True, False]) @pytest.mark.parametrize("decimal, quotechar", [(".", "'"), ("_", '"')]) -@pytest.mark.parametrize("lineterminator", ["\n", "\r\n"]) +@pytest.mark.parametrize("lineterminator", ["\n", "\t"]) def test_read_csv_parse_options( source_or_sink, decimal, quotechar, skip_blanks, lineterminator ): @@ -188,19 +197,25 @@ def test_read_csv_parse_options( write_source_str(source_or_sink, buffer) - plc_table_w_meta = plc.io.csv.read_csv( - plc.io.SourceInfo([source_or_sink]), - comment="#", - decimal=decimal, - skip_blank_lines=skip_blanks, - quotechar=quotechar, + options = ( + plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([source_or_sink]) + ) + .lineterminator(lineterminator) + .quotechar(quotechar) + .decimal(decimal) + .skip_blank_lines(skip_blanks) + .build() ) + options.set_comment("#") + plc_table_w_meta = plc.io.csv.read_csv(options) df = pd.read_csv( StringIO(buffer), comment="#", decimal=decimal, skip_blank_lines=skip_blanks, quotechar=quotechar, + lineterminator=lineterminator, ) assert_table_and_meta_eq(pa.Table.from_pandas(df), plc_table_w_meta) @@ -216,12 +231,17 @@ def test_read_csv_na_values( write_source_str(source_or_sink, buffer) - plc_table_w_meta = plc.io.csv.read_csv( - plc.io.SourceInfo([source_or_sink]), - na_filter=na_filter, - 
na_values=na_values if na_filter else None, - keep_default_na=keep_default_na, + options = ( + plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([source_or_sink]) + ) + .keep_default_na(keep_default_na) + .na_filter(na_filter) + .build() ) + if na_filter and na_values is not None: + options.set_na_values(na_values) + plc_table_w_meta = plc.io.csv.read_csv(options) df = pd.read_csv( StringIO(buffer), na_filter=na_filter, @@ -241,9 +261,11 @@ def test_read_csv_header(csv_table_data, source_or_sink, header): **_COMMON_CSV_SOURCE_KWARGS, ) - plc_table_w_meta = plc.io.csv.read_csv( - plc.io.SourceInfo([source]), header=header - ) + options = plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([source]) + ).build() + options.set_header(header) + plc_table_w_meta = plc.io.csv.read_csv(options) if header > 0: if header < len(pa_table): names_row = pa_table.take([header - 1]).to_pylist()[0].values() From 6e91f095e204c03c6771a1b044e5f55cfe8199a6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 26 Nov 2024 16:46:00 -0800 Subject: [PATCH 11/14] Remove cudf._lib.null_mask in favor of inlining pylibcudf (#17440) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17440 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/column.pyx | 21 +++++--- python/cudf/cudf/_lib/null_mask.pyx | 65 ------------------------ python/cudf/cudf/core/column/column.py | 23 +++++---- python/cudf/cudf/core/column/lists.py | 7 +-- python/cudf/cudf/core/column/string.py | 6 +-- python/cudf/cudf/core/column/struct.py | 5 +- python/cudf/cudf/core/dataframe.py | 9 ++-- python/cudf/cudf/core/groupby/groupby.py | 4 +- python/cudf/cudf/testing/_utils.py | 5 +- python/cudf/cudf/utils/utils.py | 3 +- 12 files changed, 43 
insertions(+), 107 deletions(-) delete mode 100644 python/cudf/cudf/_lib/null_mask.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 45e0fc345b5..f7c9f6df8ad 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -24,7 +24,6 @@ set(cython_sources interop.pyx json.pyx merge.pyx - null_mask.pyx orc.pyx parquet.pyx reduce.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index c51db601985..7474c4e8cd1 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -11,7 +11,6 @@ interop, json, merge, - null_mask, nvtext, orc, parquet, diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 94dbdf5534d..9cbe11d61ac 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -11,7 +11,6 @@ import pylibcudf import rmm import cudf -import cudf._lib as libcudf from cudf.core.buffer import ( Buffer, ExposureTrackedBuffer, @@ -36,7 +35,6 @@ from cudf._lib.types cimport ( dtype_to_pylibcudf_type, ) -from cudf._lib.null_mask import bitmask_allocation_size_bytes from cudf._lib.types import dtype_from_pylibcudf_column cimport pylibcudf.libcudf.copying as cpp_copying @@ -159,7 +157,10 @@ cdef class Column: if self.base_mask is None or self.offset == 0: self._mask = self.base_mask else: - self._mask = libcudf.null_mask.copy_bitmask(self) + with acquire_spill_lock(): + self._mask = as_buffer( + pylibcudf.null_mask.copy_bitmask(self.to_pylibcudf(mode="read")) + ) return self._mask @property @@ -183,7 +184,9 @@ cdef class Column: if value is not None: # bitmask size must be relative to offset = 0 data. 
- required_size = bitmask_allocation_size_bytes(self.base_size) + required_size = pylibcudf.null_mask.bitmask_allocation_size_bytes( + self.base_size + ) if value.size < required_size: error_msg = ( "The Buffer for mask is smaller than expected, " @@ -220,7 +223,7 @@ cdef class Column: and compute new data Buffers zero-copy that use pointer arithmetic to properly adjust the pointer. """ - mask_size = bitmask_allocation_size_bytes(self.size) + mask_size = pylibcudf.null_mask.bitmask_allocation_size_bytes(self.size) required_num_bytes = -(-self.size // 8) # ceiling divide error_msg = ( "The value for mask is smaller than expected, got {} bytes, " @@ -790,13 +793,17 @@ cdef class Column: mask = as_buffer( rmm.DeviceBuffer( ptr=mask_ptr, - size=bitmask_allocation_size_bytes(base_size) + size=pylibcudf.null_mask.bitmask_allocation_size_bytes( + base_size + ) ) ) else: mask = as_buffer( data=mask_ptr, - size=bitmask_allocation_size_bytes(base_size), + size=pylibcudf.null_mask.bitmask_allocation_size_bytes( + base_size + ), owner=mask_owner, exposed=True ) diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx deleted file mode 100644 index d54e8e66281..00000000000 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import pylibcudf -from pylibcudf.null_mask import MaskState - -from cudf.core.buffer import acquire_spill_lock, as_buffer - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def copy_bitmask(Column col): - """ - Copies column's validity mask buffer into a new buffer, shifting by the - offset if nonzero - """ - if col.base_mask is None: - return None - - rmm_db = pylibcudf.null_mask.copy_bitmask(col.to_pylibcudf(mode="read")) - buf = as_buffer(rmm_db) - return buf - - -def bitmask_allocation_size_bytes(num_bits): - """ - Given a size, calculates the number of bytes that should be allocated for a - column validity mask - """ - return pylibcudf.null_mask.bitmask_allocation_size_bytes(num_bits) - - -def create_null_mask(size, state=MaskState.UNINITIALIZED): - """ - Given a size and a mask state, allocate a mask that can properly represent - the given size with the given mask state - - Parameters - ---------- - size : int - Number of elements the mask needs to be able to represent - state : ``MaskState``, default ``MaskState.UNINITIALIZED`` - State the null mask should be created in - """ - rmm_db = pylibcudf.null_mask.create_null_mask(size, state) - buf = as_buffer(rmm_db) - return buf - - -@acquire_spill_lock() -def bitmask_and(list columns): - rmm_db, other = pylibcudf.null_mask.bitmask_and( - [col.to_pylibcudf(mode="read") for col in columns] - ) - buf = as_buffer(rmm_db) - return buf, other - - -@acquire_spill_lock() -def bitmask_or(list columns): - rmm_db, other = pylibcudf.null_mask.bitmask_or( - [col.to_pylibcudf(mode="read") for col in columns] - ) - buf = as_buffer(rmm_db) - return buf, other diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f0df4a3c1b3..8ddfd4a54ae 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -25,11 +25,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.null_mask import ( 
- MaskState, - bitmask_allocation_size_bytes, - create_null_mask, -) from cudf._lib.scalar import as_device_scalar from cudf._lib.stream_compaction import ( apply_boolean_mask, @@ -383,7 +378,7 @@ def memory_usage(self) -> int: if self.data is not None: n += self.data.size if self.nullable: - n += bitmask_allocation_size_bytes(self.size) + n += plc.null_mask.bitmask_allocation_size_bytes(self.size) return n def _fill( @@ -410,7 +405,11 @@ def _fill( ) if not slr.is_valid() and not self.nullable: - mask = create_null_mask(self.size, state=MaskState.ALL_VALID) + mask = as_buffer( + plc.null_mask.create_null_mask( + self.size, plc.null_mask.MaskState.ALL_VALID + ) + ) self.set_base_mask(mask) libcudf.filling.fill_in_place(self, begin, end, slr.device_value) @@ -1553,7 +1552,11 @@ def column_empty( data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) if masked: - mask = create_null_mask(row_count, state=MaskState.ALL_NULL) + mask = as_buffer( + plc.null_mask.create_null_mask( + row_count, plc.null_mask.MaskState.ALL_NULL + ) + ) else: mask = None @@ -2210,7 +2213,9 @@ def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer: typestr = desc["typestr"] typecode = typestr[1] if typecode == "t": - mask_size = bitmask_allocation_size_bytes(desc["shape"][0]) + mask_size = plc.null_mask.bitmask_allocation_size_bytes( + desc["shape"][0] + ) return as_buffer(data=desc["data"][0], size=mask_size, owner=obj) elif typecode == "b": col = as_column(cai_mask) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 9962663e811..42df5123014 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -69,10 +69,7 @@ def __init__( @cached_property def memory_usage(self): - n = 0 - if self.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) - + n = super().memory_usage child0_size = (self.size + 1) * self.base_children[0].dtype.itemsize current_base_child = 
self.base_children[1] current_offset = self.offset @@ -97,7 +94,7 @@ def memory_usage(self): ) * current_base_child.dtype.itemsize if current_base_child.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes( + n += plc.null_mask.bitmask_allocation_size_bytes( current_base_child.size ) return n diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index a9ab2d373fd..47763063c4c 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5750,17 +5750,13 @@ def end_offset(self) -> int: @cached_property def memory_usage(self) -> int: - n = 0 - if self.data is not None: - n += self.data.size + n = super().memory_usage if len(self.base_children) == 1: child0_size = (self.size + 1) * self.base_children[ 0 ].dtype.itemsize n += child0_size - if self.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) return n @property diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 8f16ba4e15b..2adc6b54bab 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -101,10 +101,7 @@ def to_pandas( @cached_property def memory_usage(self) -> int: - n = 0 - if self.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) - + n = super().memory_usage for child in self.children: n += child.memory_usage diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 73c0af45293..b58ab13be93 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -45,7 +45,7 @@ from cudf.core import column, df_protocol, indexing_utils, reshape from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable -from cudf.core.buffer import acquire_spill_lock +from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column import ( CategoricalColumn, ColumnBase, @@ -3191,9 +3191,10 @@ def 
where(self, cond, other=None, inplace=False, axis=None, level=None): out.append(result._with_type_metadata(col.dtype)) else: - out_mask = cudf._lib.null_mask.create_null_mask( - len(source_col), - state=cudf._lib.null_mask.MaskState.ALL_NULL, + out_mask = as_buffer( + plc.null_mask.create_null_mask( + len(source_col), plc.null_mask.MaskState.ALL_NULL + ) ) out.append(source_col.set_mask(out_mask)) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 315324c130c..e977f037b79 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -19,7 +19,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib import groupby as libgroupby -from cudf._lib.null_mask import bitmask_or from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default @@ -1118,8 +1117,7 @@ def ngroup(self, ascending=True): """ index = self.grouping.keys.unique().sort_values() num_groups = len(index) - _, has_null_group = bitmask_or([*index._columns]) - + has_null_group = any(col.has_nulls() for col in index._columns) if ascending: # Count ascending from 0 to num_groups - 1 groups = range(num_groups) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index a5dc8a5498c..6624a1a150e 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -15,8 +15,9 @@ from numba.cuda.cudadecl import registry as cuda_decl_registry from numba.cuda.cudaimpl import lower as cuda_lower +import pylibcudf as plc + import cudf -from cudf._lib.null_mask import bitmask_allocation_size_bytes from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.core.udf.strings_lowering import cast_string_view_to_udf_string from cudf.core.udf.strings_typing import StringView, string_view, udf_string @@ -91,7 +92,7 @@ def random_bitmask(size): size : int number of bits """ - sz = 
bitmask_allocation_size_bytes(size) + sz = plc.null_mask.bitmask_allocation_size_bytes(size) rng = np.random.default_rng(seed=0) data = rng.integers(0, 255, dtype="u1", size=sz) return data.view("i1") diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 294253cd119..e6d252b8807 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -11,6 +11,7 @@ import numpy as np import pandas as pd +import pylibcudf as plc import rmm import cudf @@ -252,7 +253,7 @@ def pa_mask_buffer_to_mask(mask_buf, size): """ Convert PyArrow mask buffer to cuDF mask buffer """ - mask_size = cudf._lib.null_mask.bitmask_allocation_size_bytes(size) + mask_size = plc.null_mask.bitmask_allocation_size_bytes(size) if mask_buf.size < mask_size: dbuf = rmm.DeviceBuffer(size=mask_size) dbuf.copy_from_host(np.asarray(mask_buf).view("u1")) From 6eaa65f6dbf396c6035bf987299f5cbb99157597 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 26 Nov 2024 17:00:54 -0800 Subject: [PATCH 12/14] Remove some cudf._lib.strings files in favor of inlining pylibcudf (#17394) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17394 --- python/cudf/cudf/_lib/strings/CMakeLists.txt | 30 - python/cudf/cudf/_lib/strings/__init__.py | 56 -- python/cudf/cudf/_lib/strings/attributes.pyx | 43 - python/cudf/cudf/_lib/strings/capitalize.pyx | 34 - python/cudf/cudf/_lib/strings/case.pyx | 34 - python/cudf/cudf/_lib/strings/char_types.pyx | 141 --- python/cudf/cudf/_lib/strings/combine.pyx | 90 -- python/cudf/cudf/_lib/strings/contains.pyx | 60 -- python/cudf/cudf/_lib/strings/extract.pyx | 24 - python/cudf/cudf/_lib/strings/find.pyx | 139 --- .../cudf/cudf/_lib/strings/find_multiple.pyx | 20 - python/cudf/cudf/_lib/strings/findall.pyx | 41 - 
python/cudf/cudf/_lib/strings/json.pyx | 26 - python/cudf/cudf/_lib/strings/padding.pyx | 73 -- python/cudf/cudf/_lib/strings/repeat.pyx | 38 - python/cudf/cudf/_lib/strings/replace.pyx | 88 -- python/cudf/cudf/_lib/strings/replace_re.pyx | 69 -- python/cudf/cudf/_lib/strings/strip.pyx | 54 - python/cudf/cudf/_lib/strings/substring.pyx | 85 -- python/cudf/cudf/_lib/strings/translate.pyx | 42 - python/cudf/cudf/_lib/strings/wrap.pyx | 24 - python/cudf/cudf/core/column/string.py | 952 ++++++++++-------- python/cudf/cudf/core/tools/numeric.py | 5 +- python/cudf/cudf/tests/test_string.py | 2 +- 24 files changed, 550 insertions(+), 1620 deletions(-) delete mode 100644 python/cudf/cudf/_lib/strings/attributes.pyx delete mode 100644 python/cudf/cudf/_lib/strings/capitalize.pyx delete mode 100644 python/cudf/cudf/_lib/strings/case.pyx delete mode 100644 python/cudf/cudf/_lib/strings/char_types.pyx delete mode 100644 python/cudf/cudf/_lib/strings/combine.pyx delete mode 100644 python/cudf/cudf/_lib/strings/contains.pyx delete mode 100644 python/cudf/cudf/_lib/strings/extract.pyx delete mode 100644 python/cudf/cudf/_lib/strings/find.pyx delete mode 100644 python/cudf/cudf/_lib/strings/find_multiple.pyx delete mode 100644 python/cudf/cudf/_lib/strings/findall.pyx delete mode 100644 python/cudf/cudf/_lib/strings/json.pyx delete mode 100644 python/cudf/cudf/_lib/strings/padding.pyx delete mode 100644 python/cudf/cudf/_lib/strings/repeat.pyx delete mode 100644 python/cudf/cudf/_lib/strings/replace.pyx delete mode 100644 python/cudf/cudf/_lib/strings/replace_re.pyx delete mode 100644 python/cudf/cudf/_lib/strings/strip.pyx delete mode 100644 python/cudf/cudf/_lib/strings/substring.pyx delete mode 100644 python/cudf/cudf/_lib/strings/translate.pyx delete mode 100644 python/cudf/cudf/_lib/strings/wrap.pyx diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt index ceeff71683c..dca9c4cc3fc 100644 --- 
a/python/cudf/cudf/_lib/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/CMakeLists.txt @@ -11,35 +11,5 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. # ============================================================================= - -set(cython_sources - attributes.pyx - capitalize.pyx - case.pyx - char_types.pyx - combine.pyx - contains.pyx - extract.pyx - find.pyx - find_multiple.pyx - findall.pyx - json.pyx - padding.pyx - repeat.pyx - replace.pyx - replace_re.pyx - strip.pyx - substring.pyx - translate.pyx - wrap.pyx -) - -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf -) - add_subdirectory(convert) add_subdirectory(split) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 4c0ec2d9ac5..b795c54c112 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -32,62 +32,10 @@ detokenize, tokenize_with_vocabulary, ) -from cudf._lib.strings.attributes import ( - code_points, - count_bytes, - count_characters, -) -from cudf._lib.strings.capitalize import capitalize, is_title, title -from cudf._lib.strings.case import swapcase, to_lower, to_upper -from cudf._lib.strings.char_types import ( - filter_alphanum, - is_alnum, - is_alpha, - is_decimal, - is_digit, - is_lower, - is_numeric, - is_space, - is_upper, -) -from cudf._lib.strings.combine import ( - concatenate, - join, - join_lists_with_column, - join_lists_with_scalar, -) -from cudf._lib.strings.contains import contains_re, count_re, like, match_re from cudf._lib.strings.convert.convert_fixed_point import to_decimal from cudf._lib.strings.convert.convert_floats import is_float from cudf._lib.strings.convert.convert_integers import is_integer from cudf._lib.strings.convert.convert_urls import 
url_decode, url_encode -from cudf._lib.strings.extract import extract -from cudf._lib.strings.find import ( - contains, - contains_multiple, - endswith, - endswith_multiple, - find, - rfind, - startswith, - startswith_multiple, -) -from cudf._lib.strings.find_multiple import find_multiple -from cudf._lib.strings.findall import find_re, findall -from cudf._lib.strings.json import get_json_object -from cudf._lib.strings.padding import center, ljust, pad, rjust, zfill -from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence -from cudf._lib.strings.replace import ( - insert, - replace, - replace_multi, - slice_replace, -) -from cudf._lib.strings.replace_re import ( - replace_multi_re, - replace_re, - replace_with_backrefs, -) from cudf._lib.strings.split.partition import partition, rpartition from cudf._lib.strings.split.split import ( rsplit, @@ -99,7 +47,3 @@ split_record, split_record_re, ) -from cudf._lib.strings.strip import lstrip, rstrip, strip -from cudf._lib.strings.substring import get, slice_from, slice_strings -from cudf._lib.strings.translate import filter_characters, translate -from cudf._lib.strings.wrap import wrap diff --git a/python/cudf/cudf/_lib/strings/attributes.pyx b/python/cudf/cudf/_lib/strings/attributes.pyx deleted file mode 100644 index df81b3942b4..00000000000 --- a/python/cudf/cudf/_lib/strings/attributes.pyx +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def count_characters(Column source_strings): - """ - Returns an integer numeric column containing the - length of each string in characters. 
- """ - plc_column = plc.strings.attributes.count_characters( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def count_bytes(Column source_strings): - """ - Returns an integer numeric column containing the - number of bytes of each string. - """ - plc_column = plc.strings.attributes.count_bytes( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def code_points(Column source_strings): - """ - Creates a numeric column with code point values (integers) - for each character of each string. - """ - plc_column = plc.strings.attributes.code_points( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/capitalize.pyx b/python/cudf/cudf/_lib/strings/capitalize.pyx deleted file mode 100644 index 42c40e2e753..00000000000 --- a/python/cudf/cudf/_lib/strings/capitalize.pyx +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def capitalize(Column source_strings): - return Column.from_pylibcudf( - plc.strings.capitalize.capitalize( - source_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def title(Column source_strings): - return Column.from_pylibcudf( - plc.strings.capitalize.title( - source_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def is_title(Column source_strings): - return Column.from_pylibcudf( - plc.strings.capitalize.is_title( - source_strings.to_pylibcudf(mode="read") - ) - ) diff --git a/python/cudf/cudf/_lib/strings/case.pyx b/python/cudf/cudf/_lib/strings/case.pyx deleted file mode 100644 index ad4cbb6f088..00000000000 --- a/python/cudf/cudf/_lib/strings/case.pyx +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf.strings import case - - -@acquire_spill_lock() -def to_upper(Column source_strings): - return Column.from_pylibcudf( - case.to_upper( - source_strings.to_pylibcudf(mode='read') - ) - ) - - -@acquire_spill_lock() -def to_lower(Column source_strings): - return Column.from_pylibcudf( - case.to_lower( - source_strings.to_pylibcudf(mode='read') - ) - ) - - -@acquire_spill_lock() -def swapcase(Column source_strings): - return Column.from_pylibcudf( - case.swapcase( - source_strings.to_pylibcudf(mode='read') - ) - ) diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx deleted file mode 100644 index a57ce29eb45..00000000000 --- a/python/cudf/cudf/_lib/strings/char_types.pyx +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -from libcpp cimport bool - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf.strings import char_types - - -@acquire_spill_lock() -def filter_alphanum(Column source_strings, object py_repl, bool keep=True): - """ - Returns a Column of strings keeping only alphanumeric character types. - """ - plc_column = char_types.filter_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.ALL_TYPES if keep - else char_types.StringCharacterTypes.ALPHANUM, - py_repl.device_value.c_value, - char_types.StringCharacterTypes.ALPHANUM if keep - else char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_decimal(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only decimal characters -- those that can be used - to extract base10 numbers. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.DECIMAL, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_alnum(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only alphanumeric characters. - - Equivalent to: is_alpha() or is_digit() or is_numeric() or is_decimal() - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.ALPHANUM, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_alpha(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only alphabetic characters. 
- """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.ALPHA, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_digit(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only decimal and digit characters. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.DIGIT, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_numeric(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only numeric characters. These include digit and - numeric characters. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.NUMERIC, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_upper(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only upper-case characters. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.UPPER, - char_types.StringCharacterTypes.CASE_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_lower(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only lower-case characters. 
- """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.LOWER, - char_types.StringCharacterTypes.CASE_TYPES - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def is_space(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contains all characters which are spaces only. - """ - plc_column = char_types.all_characters_of_type( - source_strings.to_pylibcudf(mode="read"), - char_types.StringCharacterTypes.SPACE, - char_types.StringCharacterTypes.ALL_TYPES - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx deleted file mode 100644 index 0f7b27d85d7..00000000000 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - -import cudf - - -@acquire_spill_lock() -def concatenate(list source_strings, - object sep, - object na_rep): - """ - Returns a Column by concatenating strings column-wise in `source_strings` - with the specified `sep` between each column and - `na`/`None` values are replaced by `na_rep` - """ - plc_column = plc.strings.combine.concatenate( - plc.Table([col.to_pylibcudf(mode="read") for col in source_strings]), - sep.device_value.c_value, - na_rep.device_value.c_value, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def join(Column source_strings, - object sep, - object na_rep): - """ - Returns a Column by concatenating strings row-wise in `source_strings` - with the specified `sep` between each column and - `na`/`None` values are replaced by `na_rep` - """ - plc_column = plc.strings.combine.join_strings( - source_strings.to_pylibcudf(mode="read"), - sep.device_value.c_value, - 
na_rep.device_value.c_value, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def join_lists_with_scalar( - Column source_strings, - object py_separator, - object py_narep): - """ - Returns a Column by concatenating Lists of strings row-wise - in `source_strings` with the specified `py_separator` - between each string in lists and ``/`None` values - are replaced by `py_narep` - """ - plc_column = plc.strings.combine.join_list_elements( - source_strings.to_pylibcudf(mode="read"), - py_separator.device_value.c_value, - py_narep.device_value.c_value, - cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value, - plc.strings.combine.SeparatorOnNulls.YES, - plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def join_lists_with_column( - Column source_strings, - Column separator_strings, - object py_source_narep, - object py_separator_narep): - """ - Returns a Column by concatenating Lists of strings row-wise in - `source_strings` with a corresponding separator at the same - position in `separator_strings` and ``/`None` values in - `source_strings` are replaced by `py_source_narep` and - ``/`None` values in `separator_strings` are replaced - by `py_separator_narep` - """ - plc_column = plc.strings.combine.join_list_elements( - source_strings.to_pylibcudf(mode="read"), - separator_strings.to_pylibcudf(mode="read"), - py_separator_narep.device_value.c_value, - py_source_narep.device_value.c_value, - plc.strings.combine.SeparatorOnNulls.YES, - plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx deleted file mode 100644 index 03b4887f200..00000000000 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf.strings import contains -from pylibcudf.strings.regex_program import RegexProgram - - -@acquire_spill_lock() -def contains_re(Column source_strings, object reg_ex, uint32_t flags): - """ - Returns a Column of boolean values with True for `source_strings` - that contain regular expression `reg_ex`. - """ - prog = RegexProgram.create(str(reg_ex), flags) - return Column.from_pylibcudf( - contains.contains_re(source_strings.to_pylibcudf(mode="read"), prog) - ) - - -@acquire_spill_lock() -def count_re(Column source_strings, object reg_ex, uint32_t flags): - """ - Returns a Column with count of occurrences of `reg_ex` in - each string of `source_strings` - """ - prog = RegexProgram.create(str(reg_ex), flags) - return Column.from_pylibcudf( - contains.count_re(source_strings.to_pylibcudf(mode="read"), prog) - ) - - -@acquire_spill_lock() -def match_re(Column source_strings, object reg_ex, uint32_t flags): - """ - Returns a Column with each value True if the string matches `reg_ex` - regular expression with each record of `source_strings` - """ - prog = RegexProgram.create(str(reg_ex), flags) - return Column.from_pylibcudf( - contains.matches_re(source_strings.to_pylibcudf(mode="read"), prog) - ) - - -@acquire_spill_lock() -def like(Column source_strings, object py_pattern, object py_escape): - """ - Returns a Column with each value True if the string matches the - `py_pattern` like expression with each record of `source_strings` - """ - plc_column = contains.like( - source_strings.to_pylibcudf(mode="read"), - py_pattern.device_value.c_value, - py_escape.device_value.c_value, - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx deleted file mode 100644 index 5bf336f4f3c..00000000000 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ 
/dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def extract(Column source_strings, object pattern, uint32_t flags): - """ - Returns data which contains extracted capture groups provided in - `pattern` for all `source_strings`. - The returning data contains one row for each subject string, - and one column for each group. - """ - prog = plc.strings.regex_program.RegexProgram.create(str(pattern), flags) - plc_result = plc.strings.extract.extract( - source_strings.to_pylibcudf(mode="read"), prog - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_result.columns())) diff --git a/python/cudf/cudf/_lib/strings/find.pyx b/python/cudf/cudf/_lib/strings/find.pyx deleted file mode 100644 index 2d284d1aa9d..00000000000 --- a/python/cudf/cudf/_lib/strings/find.pyx +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf as plc - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def contains(Column source_strings, object py_target): - """ - Returns a Column of boolean values with True for `source_strings` - that contain the pattern given in `py_target`. - """ - return Column.from_pylibcudf( - plc.strings.find.contains( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def contains_multiple(Column source_strings, Column target_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain the corresponding string in `target_strings`. 
- """ - return Column.from_pylibcudf( - plc.strings.find.contains( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def endswith(Column source_strings, object py_target): - """ - Returns a Column of boolean values with True for `source_strings` - that contain strings that end with the pattern given in `py_target`. - """ - - return Column.from_pylibcudf( - plc.strings.find.ends_with( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def endswith_multiple(Column source_strings, Column target_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain strings that end with corresponding location - in `target_strings`. - """ - return Column.from_pylibcudf( - plc.strings.find.ends_with( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def startswith(Column source_strings, object py_target): - """ - Returns a Column of boolean values with True for `source_strings` - that contain strings that start with the pattern given in `py_target`. - """ - return Column.from_pylibcudf( - plc.strings.find.starts_with( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def startswith_multiple(Column source_strings, Column target_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain strings that begin with corresponding location - in `target_strings`. 
- """ - return Column.from_pylibcudf( - plc.strings.find.starts_with( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def find(Column source_strings, - object py_target, - size_type start, - size_type end): - """ - Returns a Column containing lowest indexes in each string of - `source_strings` that fully contain `py_target` string. - Scan portion of strings in `source_strings` can be - controlled by setting `start` and `end` values. - """ - return Column.from_pylibcudf( - plc.strings.find.find( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value, - start, - end - ) - ) - - -@acquire_spill_lock() -def rfind(Column source_strings, - object py_target, - size_type start, - size_type end): - """ - Returns a Column containing highest indexes in each string of - `source_strings` that fully contain `py_target` string. - Scan portion of strings in `source_strings` can be - controlled by setting `start` and `end` values. - """ - - return Column.from_pylibcudf( - plc.strings.find.rfind( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value, - start, - end - ) - ) diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx deleted file mode 100644 index 39e0013769f..00000000000 --- a/python/cudf/cudf/_lib/strings/find_multiple.pyx +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def find_multiple(Column source_strings, Column target_strings): - """ - Returns a column with character position values where each - of the `target_strings` are found in each string of `source_strings`. 
- """ - plc_result = plc.strings.find_multiple.find_multiple( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx deleted file mode 100644 index 3e7a504d535..00000000000 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def findall(Column source_strings, object pattern, uint32_t flags): - """ - Returns data with all non-overlapping matches of `pattern` - in each string of `source_strings` as a lists column. - """ - prog = plc.strings.regex_program.RegexProgram.create( - str(pattern), flags - ) - plc_result = plc.strings.findall.findall( - source_strings.to_pylibcudf(mode="read"), - prog, - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def find_re(Column source_strings, object pattern, uint32_t flags): - """ - Returns character positions where the pattern first matches - the elements in source_strings. - """ - prog = plc.strings.regex_program.RegexProgram.create( - str(pattern), flags - ) - plc_result = plc.strings.findall.find_re( - source_strings.to_pylibcudf(mode="read"), - prog, - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/json.pyx b/python/cudf/cudf/_lib/strings/json.pyx deleted file mode 100644 index 374a104635a..00000000000 --- a/python/cudf/cudf/_lib/strings/json.pyx +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -import pylibcudf as plc -from pylibcudf.json cimport GetJsonObjectOptions - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def get_json_object( - Column col, - object py_json_path, - GetJsonObjectOptions options -): - """ - Apply a JSONPath string to all rows in an input column - of json strings. - """ - plc_column = plc.json.get_json_object( - col.to_pylibcudf(mode="read"), - py_json_path.device_value.c_value, - options - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/padding.pyx b/python/cudf/cudf/_lib/strings/padding.pyx deleted file mode 100644 index 015a2ebab8a..00000000000 --- a/python/cudf/cudf/_lib/strings/padding.pyx +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def pad(Column source_strings, - size_type width, - fill_char, - side=plc.strings.side_type.SideType.LEFT): - """ - Returns a Column by padding strings in `source_strings` - up to the given `width`. Direction of padding is to be specified by `side`. - The additional characters being filled can be changed by specifying - `fill_char`. - """ - plc_result = plc.strings.padding.pad( - source_strings.to_pylibcudf(mode="read"), - width, - side, - fill_char, - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def zfill(Column source_strings, - size_type width): - """ - Returns a Column by prepending strings in `source_strings` - with '0' characters up to the given `width`. 
- """ - plc_result = plc.strings.padding.zfill( - source_strings.to_pylibcudf(mode="read"), - width - ) - return Column.from_pylibcudf(plc_result) - - -def center(Column source_strings, - size_type width, - fill_char): - """ - Returns a Column by filling left and right side of strings - in `source_strings` with additional character, `fill_char` - up to the given `width`. - """ - return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.BOTH) - - -def ljust(Column source_strings, - size_type width, - fill_char): - """ - Returns a Column by filling right side of strings in `source_strings` - with additional character, `fill_char` up to the given `width`. - """ - return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.RIGHT) - - -def rjust(Column source_strings, - size_type width, - fill_char): - """ - Returns a Column by filling left side of strings in `source_strings` - with additional character, `fill_char` up to the given `width`. - """ - return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.LEFT) diff --git a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx deleted file mode 100644 index 43649d4defe..00000000000 --- a/python/cudf/cudf/_lib/strings/repeat.pyx +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def repeat_scalar(Column source_strings, - size_type repeats): - """ - Returns a Column after repeating - each string in `source_strings` - `repeats` number of times. 
- """ - plc_result = plc.strings.repeat.repeat_strings( - source_strings.to_pylibcudf(mode="read"), - repeats - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def repeat_sequence(Column source_strings, - Column repeats): - """ - Returns a Column after repeating - each string in `source_strings` - `repeats` number of times. - """ - plc_result = plc.strings.repeat.repeat_strings( - source_strings.to_pylibcudf(mode="read"), - repeats.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx deleted file mode 100644 index a260c4e4f45..00000000000 --- a/python/cudf/cudf/_lib/strings/replace.pyx +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - -import pylibcudf as plc - - -@acquire_spill_lock() -def slice_replace(Column source_strings, - size_type start, - size_type stop, - object py_repl): - """ - Returns a Column by replacing specified section - of each string with `py_repl`. Positions can be - specified with `start` and `stop` params. - """ - - cdef DeviceScalar repl = py_repl.device_value - - return Column.from_pylibcudf(plc.strings.replace.replace_slice( - source_strings.to_pylibcudf(mode="read"), - repl.c_value, - start, - stop - )) - - -@acquire_spill_lock() -def insert(Column source_strings, - size_type start, - object py_repl): - """ - Returns a Column by inserting a specified - string `py_repl` at a specific position in all strings. 
- """ - - cdef DeviceScalar repl = py_repl.device_value - - return Column.from_pylibcudf(plc.strings.replace.replace_slice( - source_strings.to_pylibcudf(mode="read"), - repl.c_value, - start, - start, - )) - - -@acquire_spill_lock() -def replace(Column source_strings, - object py_target, - object py_repl, - int32_t maxrepl): - """ - Returns a Column after replacing occurrences of - patterns `py_target` with `py_repl` in `source_strings`. - `maxrepl` indicates number of replacements to make from start. - """ - cdef DeviceScalar target = py_target.device_value - cdef DeviceScalar repl = py_repl.device_value - - return Column.from_pylibcudf(plc.strings.replace.replace( - source_strings.to_pylibcudf(mode="read"), - target.c_value, - repl.c_value, - maxrepl - )) - - -@acquire_spill_lock() -def replace_multi(Column source_strings, - Column target_strings, - Column repl_strings): - """ - Returns a Column after replacing occurrences of - patterns `target_strings` with `repl_strings` in `source_strings`. - """ - return Column.from_pylibcudf(plc.strings.replace.replace_multiple( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read"), - repl_strings.to_pylibcudf(mode="read"), - )) diff --git a/python/cudf/cudf/_lib/strings/replace_re.pyx b/python/cudf/cudf/_lib/strings/replace_re.pyx deleted file mode 100644 index 462d5c903e8..00000000000 --- a/python/cudf/cudf/_lib/strings/replace_re.pyx +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from pylibcudf.libcudf.types cimport size_type -import pylibcudf as plc - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def replace_re(Column source_strings, - object pattern, - object py_repl, - size_type n): - """ - Returns a Column after replacing occurrences regular - expressions `pattern` with `py_repl` in `source_strings`. - `n` indicates the number of resplacements to be made from - start. 
(-1 indicates all) - """ - plc_column = plc.strings.replace_re.replace_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT - ), - py_repl.device_value.c_value, - n - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def replace_with_backrefs( - Column source_strings, - object pattern, - object repl): - """ - Returns a Column after using the `repl` back-ref template to create - new string with the extracted elements found using - `pattern` regular expression in `source_strings`. - """ - plc_column = plc.strings.replace_re.replace_with_backrefs( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT - ), - repl - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def replace_multi_re(Column source_strings, - list patterns, - Column repl_strings): - """ - Returns a Column after replacing occurrences of multiple - regular expressions `patterns` with their corresponding - strings in `repl_strings` in `source_strings`. - """ - plc_column = plc.strings.replace_re.replace_re( - source_strings.to_pylibcudf(mode="read"), - patterns, - repl_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx deleted file mode 100644 index 982c5a600e7..00000000000 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -import pylibcudf as plc - - -@acquire_spill_lock() -def strip(Column source_strings, - object py_repl): - """ - Returns a Column by removing leading and trailing characters. 
- The set of characters need be stripped from left and right side - can be specified by `py_repl`. - """ - plc_result = plc.strings.strip.strip( - source_strings.to_pylibcudf(mode="read"), - plc.strings.side_type.SideType.BOTH, - py_repl.device_value.c_value, - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def lstrip(Column source_strings, - object py_repl): - """ - Returns a Column by removing leading and trailing characters. - The set of characters need be stripped from left side can - be specified by `py_repl`. - """ - plc_result = plc.strings.strip.strip( - source_strings.to_pylibcudf(mode="read"), - plc.strings.side_type.SideType.LEFT, - py_repl.device_value.c_value, - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def rstrip(Column source_strings, - object py_repl): - """ - Returns a Column by removing leading and trailing characters. - The set of characters need be stripped from right side can - be specified by `py_repl`. - """ - plc_result = plc.strings.strip.strip( - source_strings.to_pylibcudf(mode="read"), - plc.strings.side_type.SideType.RIGHT, - py_repl.device_value.c_value, - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/substring.pyx b/python/cudf/cudf/_lib/strings/substring.pyx deleted file mode 100644 index db96d99c7b6..00000000000 --- a/python/cudf/cudf/_lib/strings/substring.pyx +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import numpy as np - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from cudf._lib.scalar import as_device_scalar - -from cudf._lib.scalar cimport DeviceScalar - -import pylibcudf as plc - - -@acquire_spill_lock() -def slice_strings(Column source_strings, - object start, - object end, - object step): - """ - Returns a Column by extracting a substring of each string - at given start and end positions. 
Slicing can also be - performed in steps by skipping `step` number of - characters in a string. - """ - cdef DeviceScalar start_scalar = as_device_scalar(start, np.int32) - cdef DeviceScalar end_scalar = as_device_scalar(end, np.int32) - cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32) - - return Column.from_pylibcudf( - plc.strings.slice.slice_strings( - source_strings.to_pylibcudf(mode="read"), - start_scalar.c_value, - end_scalar.c_value, - step_scalar.c_value - ) - ) - - -@acquire_spill_lock() -def slice_from(Column source_strings, - Column starts, - Column stops): - """ - Returns a Column by extracting a substring of each string - at given starts and stops positions. `starts` and `stops` - here are positions per element in the string-column. - """ - return Column.from_pylibcudf( - plc.strings.slice.slice_strings( - source_strings.to_pylibcudf(mode="read"), - starts.to_pylibcudf(mode="read"), - stops.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def get(Column source_strings, - object index): - """ - Returns a Column which contains only single - character from each input string. The index of - characters required can be controlled by passing `index`. - """ - - if index < 0: - next_index = index - 1 - step = -1 - else: - next_index = index + 1 - step = 1 - cdef DeviceScalar start_scalar = as_device_scalar(index, np.int32) - cdef DeviceScalar end_scalar = as_device_scalar(next_index, np.int32) - cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32) - - return Column.from_pylibcudf( - plc.strings.slice.slice_strings( - source_strings.to_pylibcudf(mode="read"), - start_scalar.c_value, - end_scalar.c_value, - step_scalar.c_value - ) - ) diff --git a/python/cudf/cudf/_lib/strings/translate.pyx b/python/cudf/cudf/_lib/strings/translate.pyx deleted file mode 100644 index 3ef478532c2..00000000000 --- a/python/cudf/cudf/_lib/strings/translate.pyx +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -from libcpp cimport bool - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def translate(Column source_strings, - object mapping_table): - """ - Translates individual characters within each string - if present in the mapping_table. - """ - plc_result = plc.strings.translate.translate( - source_strings.to_pylibcudf(mode="read"), - mapping_table, - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def filter_characters(Column source_strings, - object mapping_table, - bool keep, - object py_repl): - """ - Removes or keeps individual characters within each string - using the provided mapping_table. - """ - plc_result = plc.strings.translate.filter_characters( - source_strings.to_pylibcudf(mode="read"), - mapping_table, - plc.strings.translate.FilterType.KEEP - if keep else plc.strings.translate.FilterType.REMOVE, - py_repl.device_value.c_value - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/wrap.pyx b/python/cudf/cudf/_lib/strings/wrap.pyx deleted file mode 100644 index 2b40f01f818..00000000000 --- a/python/cudf/cudf/_lib/strings/wrap.pyx +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def wrap(Column source_strings, - size_type width): - """ - Returns a Column by wrapping long strings - in the Column to be formatted in paragraphs - with length less than a given `width`. 
- """ - plc_result = plc.strings.wrap.wrap( - source_strings.to_pylibcudf(mode="read"), - width - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 47763063c4c..d45c76d3ddb 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2,14 +2,16 @@ from __future__ import annotations +import itertools import re import warnings from functools import cached_property -from typing import TYPE_CHECKING, cast, overload +from typing import TYPE_CHECKING, Literal, cast, overload import numpy as np import pandas as pd import pyarrow as pa +from typing_extensions import Self import pylibcudf as plc @@ -20,22 +22,15 @@ from cudf._lib.column import Column from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype +from cudf.core.buffer import acquire_spill_lock from cudf.core.column import column, datetime from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import can_convert_to_column - -def str_to_boolean(column: StringColumn): - """Takes in string column and returns boolean column""" - return ( - libstrings.count_characters(column) > cudf.Scalar(0, dtype="int8") - ).fillna(False) - - if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import Callable, Sequence import cupy import numba.cuda @@ -50,6 +45,16 @@ def str_to_boolean(column: StringColumn): from cudf.core.buffer import Buffer +def str_to_boolean(column: StringColumn): + """Takes in string column and returns boolean column""" + with acquire_spill_lock(): + plc_column = plc.strings.attributes.count_characters( + column.to_pylibcudf(mode="read") + ) + result = Column.from_pylibcudf(plc_column) + return (result > cudf.Scalar(0, dtype="int8")).fillna(False) + + _str_to_numeric_typecast_functions = { 
cudf.api.types.dtype("int8"): str_cast.stoi8, cudf.api.types.dtype("int16"): str_cast.stoi16, @@ -213,10 +218,12 @@ def len(self) -> SeriesOrIndex: 3 dtype: int32 """ - - return self._return_or_inplace( - libstrings.count_characters(self._column) - ) + with acquire_spill_lock(): + plc_column = plc.strings.attributes.count_characters( + self._column.to_pylibcudf(mode="read") + ) + result = Column.from_pylibcudf(plc_column) + return self._return_or_inplace(result) def byte_count(self) -> SeriesOrIndex: """ @@ -245,9 +252,12 @@ def byte_count(self) -> SeriesOrIndex: 2 11 dtype: int32 """ - return self._return_or_inplace( - libstrings.count_bytes(self._column), - ) + with acquire_spill_lock(): + plc_column = plc.strings.attributes.count_bytes( + self._column.to_pylibcudf(mode="read") + ) + result = Column.from_pylibcudf(plc_column) + return self._return_or_inplace(result) @overload def cat( @@ -347,19 +357,70 @@ def cat(self, others=None, sep=None, na_rep=None): sep = "" if others is None: - data = libstrings.join( - self._column, - cudf.Scalar(sep), - cudf.Scalar(na_rep, "str"), - ) + with acquire_spill_lock(): + plc_column = plc.strings.combine.join_strings( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(sep).device_value.c_value, + cudf.Scalar(na_rep, "str").device_value.c_value, + ) + data = Column.from_pylibcudf(plc_column) else: - other_cols = _get_cols_list(self._parent, others) - all_cols = [self._column] + other_cols - data = libstrings.concatenate( - all_cols, - cudf.Scalar(sep), - cudf.Scalar(na_rep, "str"), + parent_index = ( + self._parent.index + if isinstance(self._parent, cudf.Series) + else self._parent ) + if ( + can_convert_to_column(others) + and len(others) > 0 + and ( + can_convert_to_column( + others.iloc[0] + if isinstance(others, cudf.Series) + else others[0] + ) + ) + ): + other_cols = ( + column.as_column(frame.reindex(parent_index), dtype="str") + if ( + parent_index is not None + and isinstance(frame, cudf.Series) + and not 
frame.index.equals(parent_index) + ) + else column.as_column(frame, dtype="str") + for frame in others + ) + elif others is not None and not isinstance(others, StringMethods): + if ( + parent_index is not None + and isinstance(others, cudf.Series) + and not others.index.equals(parent_index) + ): + others = others.reindex(parent_index) + + other_cols = [column.as_column(others, dtype="str")] + else: + raise TypeError( + "others must be Series, Index, DataFrame, np.ndarrary " + "or list-like (either containing only strings or " + "containing only objects of type Series/Index/" + "np.ndarray[1-dim])" + ) + with acquire_spill_lock(): + plc_column = plc.strings.combine.concatenate( + plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + [self._column], other_cols + ) + ] + ), + cudf.Scalar(sep).device_value.c_value, + cudf.Scalar(na_rep, "str").device_value.c_value, + ) + data = Column.from_pylibcudf(plc_column) if len(data) == 1 and data.null_count == 1: data = cudf.core.column.as_column("", length=len(data)) @@ -516,9 +577,18 @@ def join( strings_column = self._split_by_character() if is_scalar(sep): - data = libstrings.join_lists_with_scalar( - strings_column, cudf.Scalar(sep), cudf.Scalar(string_na_rep) - ) + with acquire_spill_lock(): + plc_column = plc.strings.combine.join_list_elements( + strings_column.to_pylibcudf(mode="read"), + cudf.Scalar(sep).device_value.c_value, + cudf.Scalar(string_na_rep).device_value.c_value, + cudf._lib.scalar.DeviceScalar( + "", cudf.dtype("object") + ).c_value, + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, + ) + data = Column.from_pylibcudf(plc_column) elif can_convert_to_column(sep): sep_column = column.as_column(sep) if len(sep_column) != len(strings_column): @@ -531,13 +601,16 @@ def join( f"sep_na_rep should be a string scalar, got {sep_na_rep} " f"of type: {type(sep_na_rep)}" ) - - data = libstrings.join_lists_with_column( - strings_column, - 
sep_column, - cudf.Scalar(string_na_rep), - cudf.Scalar(sep_na_rep), - ) + with acquire_spill_lock(): + plc_column = plc.strings.combine.join_list_elements( + strings_column.to_pylibcudf(mode="read"), + sep_column.to_pylibcudf(mode="read"), + cudf.Scalar(sep_na_rep).device_value.c_value, + cudf.Scalar(string_na_rep).device_value.c_value, + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, + ) + data = Column.from_pylibcudf(plc_column) else: raise TypeError( f"sep should be an str, array-like or Series object, " @@ -627,9 +700,18 @@ def extract( "unsupported value for `flags` parameter" ) - data = libstrings.extract(self._column, pat, flags) + with acquire_spill_lock(): + prog = plc.strings.regex_program.RegexProgram.create(pat, flags) + plc_result = plc.strings.extract.extract( + self._column.to_pylibcudf(mode="read"), prog + ) + data = dict( + enumerate( + Column.from_pylibcudf(col) for col in plc_result.columns() + ) + ) if len(data) == 1 and expand is False: - _, data = data.popitem() + _, data = data.popitem() # type: ignore[assignment] return self._return_or_inplace(data, expand=expand) def contains( @@ -765,26 +847,41 @@ def contains( if is_scalar(pat): if regex: - result_col = libstrings.contains_re(self._column, pat, flags) + with acquire_spill_lock(): + prog = plc.strings.regex_program.RegexProgram.create( + pat, flags + ) + plc_result = plc.strings.contains.contains_re( + self._column.to_pylibcudf(mode="read"), prog + ) + result_col = Column.from_pylibcudf(plc_result) else: if case is False: - input_column = libstrings.to_lower(self._column) - pat = cudf.Scalar(pat.lower(), dtype="str") # type: ignore + input_column = self.lower()._column # type: ignore[union-attr] + plc_pat = cudf.Scalar(pat.lower(), dtype="str") # type: ignore[union-attr] else: input_column = self._column - pat = cudf.Scalar(pat, dtype="str") # type: ignore - result_col = libstrings.contains(input_column, pat) + plc_pat = cudf.Scalar(pat, 
dtype="str") + with acquire_spill_lock(): + plc_result = plc.strings.find.contains( + input_column.to_pylibcudf(mode="read"), + plc_pat.device_value.c_value, + ) + result_col = Column.from_pylibcudf(plc_result) else: # TODO: we silently ignore the `regex=` flag here if case is False: - input_column = libstrings.to_lower(self._column) - col_pat = libstrings.to_lower( - column.as_column(pat, dtype="str") - ) + input_column = self.lower()._column # type: ignore[union-attr] + col_pat = cudf.Index(pat, dtype="str").str.lower()._column # type: ignore[union-attr] else: input_column = self._column col_pat = column.as_column(pat, dtype="str") - result_col = libstrings.contains_multiple(input_column, col_pat) + with acquire_spill_lock(): + plc_result = plc.strings.find.contains( + input_column.to_pylibcudf(mode="read"), + col_pat.to_pylibcudf(mode="read"), + ) + result_col = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result_col) def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: @@ -850,11 +947,15 @@ def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: "expected esc to contain less than or equal to 1 characters" ) - result_col = libstrings.like( - self._column, cudf.Scalar(pat, "str"), cudf.Scalar(esc, "str") - ) + with acquire_spill_lock(): + plc_result = plc.strings.contains.like( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(pat, "str").device_value.c_value, + cudf.Scalar(esc, "str").device_value.c_value, + ) + result = Column.from_pylibcudf(plc_result) - return self._return_or_inplace(result_col) + return self._return_or_inplace(result) def repeat( self, @@ -901,17 +1002,16 @@ def repeat( 2 ccc dtype: object """ - if can_convert_to_column(repeats): - return self._return_or_inplace( - libstrings.repeat_sequence( - self._column, - column.as_column(repeats, dtype="int"), - ), + with acquire_spill_lock(): + if can_convert_to_column(repeats): + repeats = column.as_column(repeats, dtype="int").to_pylibcudf( + 
mode="read" + ) + plc_result = plc.strings.repeat.repeat_strings( + self._column.to_pylibcudf(mode="read"), repeats ) - - return self._return_or_inplace( - libstrings.repeat_scalar(self._column, repeats) - ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def replace( self, @@ -997,19 +1097,22 @@ def replace( "`pat` and `repl` are list-like inputs" ) - return self._return_or_inplace( - libstrings.replace_multi_re( - self._column, - list(pat), - column.as_column(repl, dtype="str"), + if regex: + with acquire_spill_lock(): + plc_result = plc.strings.replace_re.replace_re( + self._column.to_pylibcudf(mode="read"), + list(pat), + column.as_column(repl, dtype="str").to_pylibcudf( + mode="read" + ), + ) + result = Column.from_pylibcudf(plc_result) + else: + result = self._column.replace_multiple( + cast(StringColumn, column.as_column(pat, dtype="str")), + cast(StringColumn, column.as_column(repl, dtype="str")), ) - if regex - else libstrings.replace_multi( - self._column, - column.as_column(pat, dtype="str"), - column.as_column(repl, dtype="str"), - ), - ) + return self._return_or_inplace(result) # Pandas treats 0 as all if n == 0: n = -1 @@ -1019,18 +1122,25 @@ def replace( pat = pat.pattern # Pandas forces non-regex replace when pat is a single-character - return self._return_or_inplace( - libstrings.replace_re( - self._column, pat, cudf.Scalar(repl, "str"), n - ) - if regex is True and len(pat) > 1 - else libstrings.replace( - self._column, - cudf.Scalar(pat, "str"), - cudf.Scalar(repl, "str"), - n, - ), - ) + with acquire_spill_lock(): + if regex is True and len(pat) > 1: + plc_result = plc.strings.replace_re.replace_re( + self._column.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + pat, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + cudf.Scalar(repl, "str").device_value.c_value, + n, + ) + else: + plc_result = plc.strings.replace.replace( + self._column.to_pylibcudf(mode="read"), + 
cudf.Scalar(pat).device_value.c_value, + cudf.Scalar(repl).device_value.c_value, + n, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: r""" @@ -1058,14 +1168,20 @@ def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: 1 ZV576 dtype: object """ - # If 'pat' is re.Pattern then get the pattern string from it if isinstance(pat, re.Pattern): pat = pat.pattern - return self._return_or_inplace( - libstrings.replace_with_backrefs(self._column, pat, repl) - ) + with acquire_spill_lock(): + plc_result = plc.strings.replace_re.replace_with_backrefs( + self._column.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + pat, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + repl, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def slice( self, @@ -1136,10 +1252,28 @@ def slice( 2 cm dtype: object """ + param_dtype = np.dtype(np.int32) + with acquire_spill_lock(): + plc_result = plc.strings.slice.slice_strings( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(start, param_dtype).device_value.c_value, + cudf.Scalar(stop, param_dtype).device_value.c_value, + cudf.Scalar(step, param_dtype).device_value.c_value, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) - return self._return_or_inplace( - libstrings.slice_strings(self._column, start, stop, step), - ) + def _all_characters_of_type( + self, + char_type: plc.strings.char_types.StringCharacterTypes, + case_type: plc.strings.char_types.StringCharacterTypes = plc.strings.char_types.StringCharacterTypes.ALL_TYPES, + ) -> SeriesOrIndex: + with acquire_spill_lock(): + plc_column = plc.strings.char_types.all_characters_of_type( + self._column.to_pylibcudf(mode="read"), char_type, case_type + ) + result = Column.from_pylibcudf(plc_column) + return self._return_or_inplace(result) def isinteger(self) 
-> SeriesOrIndex: """ @@ -1396,7 +1530,9 @@ def isdecimal(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_decimal(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.DECIMAL + ) def isalnum(self) -> SeriesOrIndex: """ @@ -1467,7 +1603,9 @@ def isalnum(self) -> SeriesOrIndex: 2 False dtype: bool """ - return self._return_or_inplace(libstrings.is_alnum(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.ALPHANUM + ) def isalpha(self) -> SeriesOrIndex: """ @@ -1525,7 +1663,9 @@ def isalpha(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_alpha(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.ALPHA + ) def isdigit(self) -> SeriesOrIndex: """ @@ -1589,7 +1729,9 @@ def isdigit(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_digit(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.DIGIT + ) def isnumeric(self) -> SeriesOrIndex: """ @@ -1659,7 +1801,9 @@ def isnumeric(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_numeric(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.NUMERIC + ) def isupper(self) -> SeriesOrIndex: """ @@ -1718,7 +1862,10 @@ def isupper(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_upper(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.UPPER, + plc.strings.char_types.StringCharacterTypes.CASE_TYPES, + ) def islower(self) -> SeriesOrIndex: """ @@ -1777,7 +1924,10 @@ def islower(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_lower(self._column)) + return self._all_characters_of_type( 
+ plc.strings.char_types.StringCharacterTypes.LOWER, + plc.strings.char_types.StringCharacterTypes.CASE_TYPES, + ) def isipv4(self) -> SeriesOrIndex: """ @@ -1844,7 +1994,7 @@ def lower(self) -> SeriesOrIndex: 3 swapcase dtype: object """ - return self._return_or_inplace(libstrings.to_lower(self._column)) + return self._return_or_inplace(self._column.to_lower()) def upper(self) -> SeriesOrIndex: """ @@ -1895,7 +2045,7 @@ def upper(self) -> SeriesOrIndex: 3 SWAPCASE dtype: object """ - return self._return_or_inplace(libstrings.to_upper(self._column)) + return self._return_or_inplace(self._column.to_upper()) def capitalize(self) -> SeriesOrIndex: """ @@ -1923,7 +2073,7 @@ def capitalize(self) -> SeriesOrIndex: 1 Goodbye, friend dtype: object """ - return self._return_or_inplace(libstrings.capitalize(self._column)) + return self._return_or_inplace(self._column.capitalize()) def swapcase(self) -> SeriesOrIndex: """ @@ -1970,7 +2120,7 @@ def swapcase(self) -> SeriesOrIndex: 3 sWaPcAsE dtype: object """ - return self._return_or_inplace(libstrings.swapcase(self._column)) + return self._return_or_inplace(self._column.swapcase()) def title(self) -> SeriesOrIndex: """ @@ -2017,7 +2167,7 @@ def title(self) -> SeriesOrIndex: 3 Swapcase dtype: object """ - return self._return_or_inplace(libstrings.title(self._column)) + return self._return_or_inplace(self._column.title()) def istitle(self) -> SeriesOrIndex: """ @@ -2043,7 +2193,7 @@ def istitle(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_title(self._column)) + return self._return_or_inplace(self._column.is_title()) def filter_alphanum( self, repl: str | None = None, keep: bool = True @@ -2078,14 +2228,22 @@ def filter_alphanum( if repl is None: repl = "" - return self._return_or_inplace( - libstrings.filter_alphanum( - self._column, cudf.Scalar(repl, "str"), keep - ), - ) + with acquire_spill_lock(): + plc_column = plc.strings.char_types.filter_characters_of_type( + 
self._column.to_pylibcudf(mode="read"), + plc.strings.char_types.StringCharacterTypes.ALL_TYPES + if keep + else plc.strings.char_types.StringCharacterTypes.ALPHANUM, + cudf.Scalar(repl, "str").device_value.c_value, + plc.strings.char_types.StringCharacterTypes.ALPHANUM + if keep + else plc.strings.char_types.StringCharacterTypes.ALL_TYPES, + ) + result = Column.from_pylibcudf(plc_column) + return self._return_or_inplace(result) def slice_from( - self, starts: "cudf.Series", stops: "cudf.Series" + self, starts: cudf.Series, stops: cudf.Series ) -> SeriesOrIndex: """ Return substring of each string using positions for each string. @@ -2122,14 +2280,14 @@ def slice_from( 1 re dtype: object """ - - return self._return_or_inplace( - libstrings.slice_from( - self._column, - column.as_column(starts), - column.as_column(stops), - ), - ) + with acquire_spill_lock(): + plc_result = plc.strings.slice.slice_strings( + self._column.to_pylibcudf(mode="read"), + starts._column.to_pylibcudf(mode="read"), + stops._column.to_pylibcudf(mode="read"), + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def slice_replace( self, @@ -2217,11 +2375,15 @@ def slice_replace( if repl is None: repl = "" - return self._return_or_inplace( - libstrings.slice_replace( - self._column, start, stop, cudf.Scalar(repl, "str") - ), - ) + with acquire_spill_lock(): + plc_result = plc.strings.replace.replace_slice( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(repl, "str").device_value.c_value, + start, + stop, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def insert(self, start: int = 0, repl: str | None = None) -> SeriesOrIndex: """ @@ -2266,12 +2428,7 @@ def insert(self, start: int = 0, repl: str | None = None) -> SeriesOrIndex: 1 0123456789_ dtype: object """ - if repl is None: - repl = "" - - return self._return_or_inplace( - libstrings.insert(self._column, start, cudf.Scalar(repl, "str")), - ) + return 
self.slice_replace(start, start, repl) def get(self, i: int = 0) -> SeriesOrIndex: """ @@ -2314,17 +2471,22 @@ def get(self, i: int = 0) -> SeriesOrIndex: 2 f dtype: object """ - - return self._return_or_inplace(libstrings.get(self._column, i)) + if i < 0: + next_index = i - 1 + step = -1 + else: + next_index = i + 1 + step = 1 + return self.slice(i, next_index, step) def get_json_object( self, - json_path, + json_path: str, *, - allow_single_quotes=False, - strip_quotes_from_single_strings=True, - missing_fields_as_nulls=False, - ): + allow_single_quotes: bool = False, + strip_quotes_from_single_strings: bool = True, + missing_fields_as_nulls: bool = False, + ) -> SeriesOrIndex: r""" Applies a JSONPath string to an input strings column where each row in the column is a valid json string @@ -2394,11 +2556,14 @@ def get_json_object( ), missing_fields_as_nulls=missing_fields_as_nulls, ) - return self._return_or_inplace( - libstrings.get_json_object( - self._column, cudf.Scalar(json_path, "str"), options + with acquire_spill_lock(): + plc_result = plc.json.get_json_object( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(json_path, "str").device_value.c_value, + options, ) - ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def split( self, @@ -2893,7 +3058,10 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: ) def pad( - self, width: int, side: str = "left", fillchar: str = " " + self, + width: int, + side: Literal["left", "both", "right"] = "left", + fillchar: str = " ", ) -> SeriesOrIndex: """ Pad strings in the Series/Index up to width. 
@@ -2974,10 +3142,15 @@ def pad( raise ValueError( "side has to be either one of {'left', 'right', 'both'}" ) - - return self._return_or_inplace( - libstrings.pad(self._column, width, fillchar, side) - ) + with acquire_spill_lock(): + plc_result = plc.strings.padding.pad( + self._column.to_pylibcudf(mode="read"), + width, + side, + fillchar, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def zfill(self, width: int) -> SeriesOrIndex: """ @@ -3043,7 +3216,12 @@ def zfill(self, width: int) -> SeriesOrIndex: msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - return self._return_or_inplace(libstrings.zfill(self._column, width)) + with acquire_spill_lock(): + plc_result = plc.strings.padding.zfill( + self._column.to_pylibcudf(mode="read"), width + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def center(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ @@ -3100,22 +3278,7 @@ def center(self, width: int, fillchar: str = " ") -> SeriesOrIndex: 3 --d--- dtype: object """ - if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - return self._return_or_inplace( - libstrings.center(self._column, width, fillchar) - ) + return self.pad(width, "both", fillchar) def ljust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ @@ -3154,22 +3317,7 @@ def ljust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: 3 __ dtype: object """ - if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") 
- - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - return self._return_or_inplace( - libstrings.ljust(self._column, width, fillchar) - ) + return self.pad(width, "right", fillchar) def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ @@ -3208,22 +3356,21 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: 3 __ dtype: object """ - if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) + return self.pad(width, "left", fillchar) - return self._return_or_inplace( - libstrings.rjust(self._column, width, fillchar) - ) + def _strip( + self, side: plc.string.side_type.SideType, to_strip: str | None = None + ) -> SeriesOrIndex: + if to_strip is None: + to_strip = "" + with acquire_spill_lock(): + plc_result = plc.strings.strip.strip( + self._column.to_pylibcudf(mode="read"), + side, + cudf.Scalar(to_strip, "str").device_value.c_value, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def strip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" @@ -3277,12 +3424,7 @@ def strip(self, to_strip: str | None = None) -> SeriesOrIndex: 3 dtype: object """ - if to_strip is None: - to_strip = "" - - return self._return_or_inplace( - libstrings.strip(self._column, cudf.Scalar(to_strip, "str")) - ) + return self._strip(plc.strings.side_type.SideType.BOTH, to_strip) def lstrip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" @@ -3324,12 +3466,7 @@ def lstrip(self, to_strip: str | None = None) -> SeriesOrIndex: 3 dtype: object """ - if to_strip is None: - to_strip = "" - - return self._return_or_inplace( - libstrings.lstrip(self._column, 
cudf.Scalar(to_strip, "str")) - ) + return self._strip(plc.strings.side_type.SideType.LEFT, to_strip) def rstrip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" @@ -3379,12 +3516,7 @@ def rstrip(self, to_strip: str | None = None) -> SeriesOrIndex: 3 dtype: object """ - if to_strip is None: - to_strip = "" - - return self._return_or_inplace( - libstrings.rstrip(self._column, cudf.Scalar(to_strip, "str")) - ) + return self._strip(plc.strings.side_type.SideType.RIGHT, to_strip) def wrap(self, width: int, **kwargs) -> SeriesOrIndex: r""" @@ -3478,7 +3610,12 @@ def wrap(self, width: int, **kwargs) -> SeriesOrIndex: "`break_on_hyphens`=False" ) - return self._return_or_inplace(libstrings.wrap(self._column, width)) + with acquire_spill_lock(): + plc_result = plc.strings.wrap.wrap( + self._column.to_pylibcudf(mode="read"), width + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: r""" @@ -3546,10 +3683,37 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: raise NotImplementedError( "unsupported value for `flags` parameter" ) + with acquire_spill_lock(): + prog = plc.strings.regex_program.RegexProgram.create(pat, flags) + plc_result = plc.strings.contains.count_re( + self._column.to_pylibcudf(mode="read"), prog + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) - return self._return_or_inplace( - libstrings.count_re(self._column, pat, flags) - ) + def _findall( + self, + method: Callable[ + [plc.Column, plc.strings.regex_program.RegexProgram], plc.Column + ], + pat: str | re.Pattern, + flags: int = 0, + ) -> SeriesOrIndex: + if isinstance(pat, re.Pattern): + flags = pat.flags & ~re.U + pat = pat.pattern + if not _is_supported_regex_flags(flags): + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) + with acquire_spill_lock(): + prog = plc.strings.regex_program.RegexProgram.create(pat, flags) + 
plc_result = method( + self._column.to_pylibcudf(mode="read"), + prog, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: """ @@ -3616,16 +3780,7 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. """ - if isinstance(pat, re.Pattern): - flags = pat.flags & ~re.U - pat = pat.pattern - if not _is_supported_regex_flags(flags): - raise NotImplementedError( - "unsupported value for `flags` parameter" - ) - - data = libstrings.findall(self._column, pat, flags) - return self._return_or_inplace(data) + return self._findall(plc.strings.findall.findall, pat, flags) def find_re(self, pat: str, flags: int = 0) -> SeriesOrIndex: """ @@ -3656,16 +3811,7 @@ def find_re(self, pat: str, flags: int = 0) -> SeriesOrIndex: 3 2 dtype: int32 """ - if isinstance(pat, re.Pattern): - flags = pat.flags & ~re.U - pat = pat.pattern - if not _is_supported_regex_flags(flags): - raise NotImplementedError( - "Unsupported value for `flags` parameter" - ) - - data = libstrings.find_re(self._column, pat, flags) - return self._return_or_inplace(data) + return self._findall(plc.strings.findall.find_re, pat, flags) def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: """ @@ -3723,8 +3869,15 @@ def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: f"got: {patterns_column.dtype}" ) + with acquire_spill_lock(): + plc_result = plc.strings.find_multiple.find_multiple( + self._column.to_pylibcudf(mode="read"), + patterns_column.to_pylibcudf(mode="read"), + ) + result = Column.from_pylibcudf(plc_result) + return cudf.Series._from_column( - libstrings.find_multiple(self._column, patterns_column), + result, name=self._parent.name, index=self._parent.index if isinstance(self._parent, cudf.Series) @@ -3816,9 +3969,34 @@ def isspace(self) -> SeriesOrIndex: 2 False dtype: bool """ - return 
self._return_or_inplace(libstrings.is_space(self._column)) + return self._all_characters_of_type( + plc.strings.char_types.StringCharacterTypes.SPACE + ) - def endswith(self, pat: str) -> SeriesOrIndex: + def _starts_ends_with( + self, + method: Callable[[plc.Column, plc.Column | plc.Scalar], plc.Column], + pat: str | Sequence, + ) -> SeriesOrIndex: + if pat is None: + raise TypeError( + f"expected a string or a sequence-like object, not " + f"{type(pat).__name__}" + ) + elif is_scalar(pat): + plc_pat = cudf.Scalar(pat, "str").device_value.c_value + else: + plc_pat = column.as_column(pat, dtype="str").to_pylibcudf( + mode="read" + ) + with acquire_spill_lock(): + plc_result = method( + self._column.to_pylibcudf(mode="read"), plc_pat + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) + + def endswith(self, pat: str | Sequence) -> SeriesOrIndex: """ Test if the end of each string element matches a pattern. @@ -3860,21 +4038,7 @@ def endswith(self, pat: str) -> SeriesOrIndex: `na` parameter is not yet supported, as cudf uses native strings instead of Python objects. 
""" - if pat is None: - raise TypeError( - f"expected a string or a sequence-like object, not " - f"{type(pat).__name__}" - ) - elif is_scalar(pat): - result_col = libstrings.endswith( - self._column, cudf.Scalar(pat, "str") - ) - else: - result_col = libstrings.endswith_multiple( - self._column, column.as_column(pat, dtype="str") - ) - - return self._return_or_inplace(result_col) + return self._starts_ends_with(plc.strings.find.ends_with, pat) def startswith(self, pat: str | Sequence) -> SeriesOrIndex: """ @@ -3923,21 +4087,7 @@ def startswith(self, pat: str | Sequence) -> SeriesOrIndex: 3 dtype: bool """ - if pat is None: - raise TypeError( - f"expected a string or a sequence-like object, not " - f"{type(pat).__name__}" - ) - elif is_scalar(pat): - result_col = libstrings.startswith( - self._column, cudf.Scalar(pat, "str") - ) - else: - result_col = libstrings.startswith_multiple( - self._column, column.as_column(pat, dtype="str") - ) - - return self._return_or_inplace(result_col) + return self._starts_ends_with(plc.strings.find.starts_with, pat) def removesuffix(self, suffix: str) -> SeriesOrIndex: """ @@ -3972,12 +4122,9 @@ def removesuffix(self, suffix: str) -> SeriesOrIndex: """ if suffix is None or len(suffix) == 0: return self._return_or_inplace(self._column) - ends_column = libstrings.endswith( - self._column, cudf.Scalar(suffix, "str") - ) - removed_column = libstrings.slice_strings( - self._column, 0, -len(suffix), None - ) + ends_column = self.endswith(suffix)._column # type: ignore[union-attr] + removed_column = self.slice(0, -len(suffix), None)._column # type: ignore[union-attr] + result = cudf._lib.copying.copy_if_else( removed_column, self._column, ends_column ) @@ -4016,17 +4163,38 @@ def removeprefix(self, prefix: str) -> SeriesOrIndex: """ if prefix is None or len(prefix) == 0: return self._return_or_inplace(self._column) - starts_column = libstrings.startswith( - self._column, cudf.Scalar(prefix, "str") - ) - removed_column = 
libstrings.slice_strings( - self._column, len(prefix), None, None - ) + starts_column = self.startswith(prefix)._column # type: ignore[union-attr] + removed_column = self.slice(len(prefix), None, None)._column # type: ignore[union-attr] result = cudf._lib.copying.copy_if_else( removed_column, self._column, starts_column ) return self._return_or_inplace(result) + def _find( + self, + method: Callable[[plc.Column, plc.Scalar, int, int], plc.Column], + sub: str, + start: int = 0, + end: int | None = None, + ) -> SeriesOrIndex: + if not isinstance(sub, str): + raise TypeError( + f"expected a string object, not {type(sub).__name__}" + ) + + if end is None: + end = -1 + + with acquire_spill_lock(): + plc_result = method( + self._column.to_pylibcudf(mode="read"), + cudf.Scalar(sub, "str").device_value.c_value, + start, + end, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) + def find( self, sub: str, start: int = 0, end: int | None = None ) -> SeriesOrIndex: @@ -4070,19 +4238,7 @@ def find( 3 2 dtype: int32 """ - if not isinstance(sub, str): - raise TypeError( - f"expected a string object, not {type(sub).__name__}" - ) - - if end is None: - end = -1 - - result_col = libstrings.find( - self._column, cudf.Scalar(sub, "str"), start, end - ) - - return self._return_or_inplace(result_col) + return self._find(plc.strings.find.find, sub, start, end) def rfind( self, sub: str, start: int = 0, end: int | None = None @@ -4131,19 +4287,7 @@ def rfind( 2 -1 dtype: int32 """ - if not isinstance(sub, str): - raise TypeError( - f"expected a string object, not {type(sub).__name__}" - ) - - if end is None: - end = -1 - - result_col = libstrings.rfind( - self._column, cudf.Scalar(sub, "str"), start, end - ) - - return self._return_or_inplace(result_col) + return self._find(plc.strings.find.rfind, sub, start, end) def index( self, sub: str, start: int = 0, end: int | None = None @@ -4196,9 +4340,7 @@ def index( if end is None: end = -1 - result_col 
= libstrings.find( - self._column, cudf.Scalar(sub, "str"), start, end - ) + result_col = self.find(sub, start, end)._column # type: ignore[union-attr] result = self._return_or_inplace(result_col) @@ -4258,9 +4400,7 @@ def rindex( if end is None: end = -1 - result_col = libstrings.rfind( - self._column, cudf.Scalar(sub, "str"), start, end - ) + result_col = self.rfind(sub, start, end)._column # type: ignore[union-attr] result = self._return_or_inplace(result_col) @@ -4323,10 +4463,13 @@ def match( raise NotImplementedError( "unsupported value for `flags` parameter" ) - - return self._return_or_inplace( - libstrings.match_re(self._column, pat, flags) - ) + with acquire_spill_lock(): + prog = plc.strings.regex_program.RegexProgram.create(pat, flags) + plc_result = plc.strings.contains.matches_re( + self._column.to_pylibcudf(mode="read"), prog + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def url_decode(self) -> SeriesOrIndex: """ @@ -4420,9 +4563,12 @@ def code_points(self) -> SeriesOrIndex: 2 99 dtype: int32 """ - return self._return_or_inplace( - libstrings.code_points(self._column), retain_index=False - ) + with acquire_spill_lock(): + plc_column = plc.strings.attributes.code_points( + self._column.to_pylibcudf(mode="read") + ) + result = Column.from_pylibcudf(plc_column) + return self._return_or_inplace(result, retain_index=False) def translate(self, table: dict) -> SeriesOrIndex: """ @@ -4465,9 +4611,12 @@ def translate(self, table: dict) -> SeriesOrIndex: dtype: object """ table = str.maketrans(table) - return self._return_or_inplace( - libstrings.translate(self._column, table) - ) + with acquire_spill_lock(): + plc_result = plc.strings.translate.translate( + self._column.to_pylibcudf(mode="read"), table + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def filter_characters( self, table: dict, keep: bool = True, repl: str | None = None @@ -4516,11 +4665,17 @@ def 
filter_characters( if repl is None: repl = "" table = str.maketrans(table) - return self._return_or_inplace( - libstrings.filter_characters( - self._column, table, keep, cudf.Scalar(repl, "str") - ), - ) + with acquire_spill_lock(): + plc_result = plc.strings.translate.filter_characters( + self._column.to_pylibcudf(mode="read"), + table, + plc.strings.translate.FilterType.KEEP + if keep + else plc.strings.translate.FilterType.REMOVE, + cudf.Scalar(repl, "str").device_value.c_value, + ) + result = Column.from_pylibcudf(plc_result) + return self._return_or_inplace(result) def normalize_spaces(self) -> SeriesOrIndex: r""" @@ -5614,17 +5769,12 @@ def _massage_string_arg(value, name, allow_col=False): allowed_types.append("Column") - raise ValueError( - f"Expected {_expected_types_format(allowed_types)} " - f"for {name} but got {type(value)}" - ) - - -def _expected_types_format(types): - if len(types) == 1: - return types[0] + if len(allowed_types) == 1: + expected = allowed_types[0] + else: + expected = ", ".join(allowed_types[:-1]) + ", or " + allowed_types[-1] - return ", ".join(types[:-1]) + ", or " + types[-1] + raise ValueError(f"Expected {expected} for {name} but got {type(value)}") class StringColumn(column.ColumnBase): @@ -5844,11 +5994,13 @@ def sum( skipna=skipna, min_count=min_count ) if isinstance(result_col, type(self)): - return libstrings.join( - result_col, - sep=cudf.Scalar(""), - na_rep=cudf.Scalar(None, "str"), - ).element_indexing(0) + with acquire_spill_lock(): + plc_column = plc.strings.combine.join_strings( + result_col.to_pylibcudf(mode="read"), + cudf.Scalar("").device_value.c_value, + cudf.Scalar(None, "str").device_value.c_value, + ) + return Column.from_pylibcudf(plc_column).element_indexing(0) else: return result_col @@ -5897,13 +6049,12 @@ def strptime( ) is_nat = self == "NaT" without_nat = self.apply_boolean_mask(is_nat.unary_operator("not")) - all_same_length = ( - libstrings.count_characters(without_nat).distinct_count( - dropna=True + 
with acquire_spill_lock(): + plc_column = plc.strings.attributes.count_characters( + without_nat.to_pylibcudf(mode="read") ) - == 1 - ) - if not all_same_length: + char_counts = Column.from_pylibcudf(plc_column) + if char_counts.distinct_count(dropna=True) != 1: # Unfortunately disables OK cases like: # ["2020-01-01", "2020-01-01 00:00:00"] # But currently incorrect for cases like (drops 10): @@ -6104,14 +6255,18 @@ def _binaryop( rhs: cudf.Scalar | StringColumn lhs, rhs = (other, self) if reflect else (self, other) - return cast( - "column.ColumnBase", - libstrings.concatenate( - [lhs, rhs], - sep=cudf.Scalar(""), - na_rep=cudf.Scalar(None, "str"), - ), - ) + with acquire_spill_lock(): + plc_column = plc.strings.combine.concatenate( + plc.Table( + [ + lhs.to_pylibcudf(mode="read"), + rhs.to_pylibcudf(mode="read"), + ] + ), + cudf.Scalar("").device_value.c_value, + cudf.Scalar(None, "str").device_value.c_value, + ) + return Column.from_pylibcudf(plc_column) elif op in { "__eq__", "__ne__", @@ -6151,52 +6306,39 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": return to_view.view(dtype) - -def _get_cols_list(parent_obj, others): - parent_index = ( - parent_obj.index if isinstance(parent_obj, cudf.Series) else parent_obj - ) - - if ( - can_convert_to_column(others) - and len(others) > 0 - and ( - can_convert_to_column( - others.iloc[0] - if isinstance(others, cudf.Series) - else others[0] - ) - ) - ): + def _modify_characters( + self, method: Callable[[plc.Column], plc.Column] + ) -> Self: """ - If others is a list-like object (in our case lists & tuples) - just another Series/Index, great go ahead with concatenation. + Helper function for methods that modify characters e.g. 
to_lower """ - cols_list = [ - column.as_column(frame.reindex(parent_index), dtype="str") - if ( - parent_index is not None - and isinstance(frame, cudf.Series) - and not frame.index.equals(parent_index) - ) - else column.as_column(frame, dtype="str") - for frame in others - ] + with acquire_spill_lock(): + plc_column = method(self.to_pylibcudf(mode="read")) + return cast(Self, Column.from_pylibcudf(plc_column)) - return cols_list - elif others is not None and not isinstance(others, StringMethods): - if ( - parent_index is not None - and isinstance(others, cudf.Series) - and not others.index.equals(parent_index) - ): - others = others.reindex(parent_index) + def to_lower(self) -> Self: + return self._modify_characters(plc.strings.case.to_lower) - return [column.as_column(others, dtype="str")] - else: - raise TypeError( - "others must be Series, Index, DataFrame, np.ndarrary " - "or list-like (either containing only strings or " - "containing only objects of type Series/Index/" - "np.ndarray[1-dim])" - ) + def to_upper(self) -> Self: + return self._modify_characters(plc.strings.case.to_upper) + + def capitalize(self) -> Self: + return self._modify_characters(plc.strings.capitalize.capitalize) + + def swapcase(self) -> Self: + return self._modify_characters(plc.strings.case.swapcase) + + def title(self) -> Self: + return self._modify_characters(plc.strings.capitalize.title) + + def is_title(self) -> Self: + return self._modify_characters(plc.strings.capitalize.is_title) + + def replace_multiple(self, pattern: Self, replacements: Self) -> Self: + with acquire_spill_lock(): + plc_result = plc.strings.replace.replace_multiple( + self.to_pylibcudf(mode="read"), + pattern.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + ) + return cast(Self, Column.from_pylibcudf(plc_result)) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 9a22045ff78..91f23490031 100644 --- a/python/cudf/cudf/core/tools/numeric.py 
+++ b/python/cudf/cudf/core/tools/numeric.py @@ -242,12 +242,11 @@ def _convert_str_col(col, errors, _downcast=None): def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase: """Handles empty and infinity strings""" - col = libstrings.to_lower(col) + col = col.to_lower() # type: ignore[attr-defined] col = col.find_and_replace(as_column([""]), as_column(["NaN"])) # TODO: This can be handled by libcudf in # future see StringColumn.as_numerical_column - col = libstrings.replace_multi( - col, + col = col.replace_multiple( # type: ignore[attr-defined] as_column(["+", "inf", "inity"]), as_column(["", "Inf", ""]), ) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index e25f99d7bee..9700f548a16 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1272,7 +1272,7 @@ def test_string_slice_from(): gs = cudf.Series(["hello world", "holy accéntéd", "batman", None, ""]) d_starts = cudf.Series([2, 3, 0, -1, -1], dtype=np.int32) d_stops = cudf.Series([-1, -1, 0, -1, -1], dtype=np.int32) - got = gs.str.slice_from(starts=d_starts._column, stops=d_stops._column) + got = gs.str.slice_from(starts=d_starts, stops=d_stops) expected = cudf.Series(["llo world", "y accéntéd", "", None, ""]) assert_eq(got, expected) From 6d8ec80c61468590a5909168576cf755738b7fec Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 26 Nov 2024 17:32:09 -0800 Subject: [PATCH 13/14] Remove cudf._lib.quantile (#17424) Follow up to https://github.com/rapidsai/cudf/pull/17347 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17424 --- python/cudf/cudf/_lib/quantiles.pyx | 53 ----------------------------- 1 file changed, 53 deletions(-) delete mode 100644 python/cudf/cudf/_lib/quantiles.pyx diff --git a/python/cudf/cudf/_lib/quantiles.pyx 
b/python/cudf/cudf/_lib/quantiles.pyx deleted file mode 100644 index 509cfe5e9f8..00000000000 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool -from libcpp.vector cimport vector - -from cudf._lib.column cimport Column - -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf as plc - - -@acquire_spill_lock() -def quantile( - Column input, - vector[double] q, - str interp, - Column ordered_indices, - bool exact, -): - return Column.from_pylibcudf( - plc.quantiles.quantile( - input.to_pylibcudf(mode="read"), - q, - plc.types.Interpolation[interp.upper()], - ordered_indices.to_pylibcudf(mode="read"), - exact - ) - ) - - -def quantile_table( - list source_columns, - vector[double] q, - object interp, - object is_input_sorted, - list column_order, - list null_precedence, -): - return columns_from_pylibcudf_table( - plc.quantiles.quantiles( - plc.Table([ - c.to_pylibcudf(mode="read") for c in source_columns - ]), - q, - interp, - is_input_sorted, - column_order, - null_precedence - ) - ) From 83f0ae02663e036f5fa52124561f36c646ae0918 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 27 Nov 2024 00:31:01 -0600 Subject: [PATCH 14/14] Fix write_json failure for zero columns in table/struct (#17414) Closes #17413 num_rows are passed to ensure empty`{}` is created for zero columns. 
Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17414 --- cpp/src/io/json/write_json.cu | 64 +++++++++++++++++++---------- python/cudf/cudf/tests/test_json.py | 6 +++ 2 files changed, 48 insertions(+), 22 deletions(-) diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 8156258c810..a4885d59cc5 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -244,6 +244,7 @@ struct validity_fn { * * @param strings_columns Table of strings columns * @param column_names Column of names for each column in the table + * @param num_rows Number of rows in the table * @param row_prefix Prepend this string to each row * @param row_suffix Append this string to each row * @param value_separator Separator between values @@ -255,6 +256,7 @@ struct validity_fn { */ std::unique_ptr struct_to_strings(table_view const& strings_columns, column_view const& column_names, + size_type const num_rows, string_view const row_prefix, string_view const row_suffix, string_view const value_separator, @@ -268,8 +270,7 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, auto const num_columns = strings_columns.num_columns(); CUDF_EXPECTS(num_columns == column_names.size(), "Number of column names should be equal to number of columns in the table"); - auto const strings_count = strings_columns.num_rows(); - if (strings_count == 0) // empty begets empty + if (num_rows == 0) // empty begets empty return make_empty_column(type_id::STRING); // check all columns are of type string CUDF_EXPECTS(std::all_of(strings_columns.begin(), @@ -277,31 +278,46 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, [](auto const& c) { return c.type().id() == type_id::STRING; }), "All columns must be of type string"); auto constexpr strviews_per_column = 3; // (for each 
"column_name:", "value", "separator") - auto const num_strviews_per_row = strings_columns.num_columns() * strviews_per_column + 1; + auto const num_strviews_per_row = strings_columns.num_columns() == 0 + ? 2 + : (1 + strings_columns.num_columns() * strviews_per_column); // e.g. {col1: value, col2: value, col3: value} = 1 + 3 + 3 + (3-1) + 1 = 10 auto tbl_device_view = cudf::table_device_view::create(strings_columns, stream); auto d_column_names = column_device_view::create(column_names, stream); // Note for future: chunk it but maximize parallelism, if memory usage is high. - auto const total_strings = num_strviews_per_row * strings_columns.num_rows(); - auto const total_rows = strings_columns.num_rows() * strings_columns.num_columns(); + auto const total_strings = num_strviews_per_row * num_rows; + auto const total_rows = num_rows * strings_columns.num_columns(); rmm::device_uvector d_strviews(total_strings, stream); - struct_scatter_strings_fn scatter_fn{*tbl_device_view, - *d_column_names, - strviews_per_column, - num_strviews_per_row, - row_prefix, - row_suffix, - value_separator, - narep.value(stream), - include_nulls, - d_strviews.begin()}; - // scatter row_prefix, row_suffix, column_name:, value, value_separator as string_views - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(total_rows), - scatter_fn); + if (strings_columns.num_columns() > 0) { + struct_scatter_strings_fn scatter_fn{*tbl_device_view, + *d_column_names, + strviews_per_column, + num_strviews_per_row, + row_prefix, + row_suffix, + value_separator, + narep.value(stream), + include_nulls, + d_strviews.begin()}; + // scatter row_prefix, row_suffix, column_name:, value, value_separator as string_views + thrust::for_each(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(total_rows), + scatter_fn); + } else { + thrust::for_each( + rmm::exec_policy_nosync(stream), + 
thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + [d_strviews = d_strviews.begin(), row_prefix, row_suffix, num_strviews_per_row] __device__( + auto idx) { + auto const this_index = idx * num_strviews_per_row; + d_strviews[this_index] = row_prefix; + d_strviews[this_index + num_strviews_per_row - 1] = row_suffix; + }); + } if (!include_nulls) { // if previous column was null, then we skip the value separator rmm::device_uvector d_str_separator(total_rows, stream); @@ -341,7 +357,7 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, // gather from offset and create a new string column auto old_offsets = strings_column_view(joined_col->view()).offsets(); - rmm::device_uvector row_string_offsets(strings_columns.num_rows() + 1, stream, mr); + rmm::device_uvector row_string_offsets(num_rows + 1, stream, mr); auto const d_strview_offsets = cudf::detail::make_counting_transform_iterator( 0, cuda::proclaim_return_type([num_strviews_per_row] __device__(size_type const i) { return i * num_strviews_per_row; @@ -353,7 +369,7 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, row_string_offsets.begin()); auto chars_data = joined_col->release().data; return make_strings_column( - strings_columns.num_rows(), + num_rows, std::make_unique(std::move(row_string_offsets), rmm::device_buffer{}, 0), std::move(chars_data.release()[0]), 0, @@ -677,6 +693,7 @@ struct column_to_strings_fn { auto col_string = operator()(child_it, child_it + column.num_children(), children_names, + column.size(), struct_row_end_wrap.value(stream_)); col_string->set_null_mask(cudf::detail::copy_bitmask(column, stream_, mr_), column.null_count()); @@ -688,6 +705,7 @@ struct column_to_strings_fn { std::unique_ptr operator()(column_iterator column_begin, column_iterator column_end, host_span children_names, + size_type num_rows, cudf::string_view const row_end_wrap_value) const { auto const num_columns = std::distance(column_begin, 
column_end); @@ -733,6 +751,7 @@ struct column_to_strings_fn { // return struct_to_strings(str_table_view, column_names_view, + num_rows, struct_row_begin_wrap.value(stream_), row_end_wrap_value, struct_value_separator.value(stream_), @@ -908,6 +927,7 @@ void write_json_uncompressed(data_sink* out_sink, auto str_concat_col = converter(sub_view.begin(), sub_view.end(), user_column_names, + sub_view.num_rows(), d_line_terminator_with_row_end.value(stream)); // Needs line_terminator at the end, to separate from next chunk diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 47976fc4bac..b48be6b2c2f 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -277,6 +277,12 @@ def test_cudf_json_writer_read(gdf_writer_types): """{"a":{"L": [{"M": null}, {}]}, "b":1.1}\n""", """{"a":{"L": [{}, {}]}, "b":1.1}\n""", ), + # empty structs + ("""{"A": null}\n {"A": {}}\n {}""", """{}\n{"A":{}}\n{}\n"""), + ( + """{"A": {"B": null}}\n {"A": {"B": {}}}\n {"A": {}}""", + """{"A":{}}\n{"A":{"B":{}}}\n{"A":{}}\n""", + ), ], ) def test_cudf_json_roundtrip(jsonl_string, expected):