diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index ecf2f610697..de53e7586cd 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -18,11 +18,11 @@ #include #include +#include #include #include #include -#include -#include +#include #include #include @@ -30,8 +30,17 @@ #include +#include +#include + namespace cudf { namespace detail { +template +constexpr bool is_product_supported() +{ + return is_numeric(); +} + /** * @brief Maps an `aggregation::Kind` value to it's corresponding binary * operator. @@ -113,465 +122,6 @@ constexpr bool has_corresponding_operator() return !std::is_same_v::type, void>; } -template -struct update_target_element { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - CUDF_UNREACHABLE("Invalid source type and aggregation combination."); - } -}; - -template -struct update_target_element< - Source, - aggregation::MIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !is_fixed_point()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - cudf::detail::atomic_min(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::MIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - 
size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - using DeviceTarget = device_storage_type_t; - using DeviceSource = device_storage_type_t; - - cudf::detail::atomic_min(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::MAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && cudf::has_atomic_support() && - !is_fixed_point()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - cudf::detail::atomic_max(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::MAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - using DeviceTarget = device_storage_type_t; - using DeviceSource = device_storage_type_t; - - cudf::detail::atomic_max(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::SUM, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && 
cudf::has_atomic_support() && - !cudf::is_fixed_point() && !cudf::is_timestamp()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::SUM, - target_has_nulls, - source_has_nulls, - std::enable_if_t() && - cudf::has_atomic_support>()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - using DeviceTarget = device_storage_type_t; - using DeviceSource = device_storage_type_t; - - cudf::detail::atomic_add(&target.element(target_index), - static_cast(source.element(source_index))); - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -/** - * @brief Function object to update a single element in a target column using - * the dictionary key addressed by the specific index. - * - * SFINAE is used to prevent recursion for dictionary type. Dictionary keys cannot be a - * dictionary. 
- * - */ -template -struct update_target_from_dictionary { - template ()>* = nullptr> - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - update_target_element{}( - target, target_index, source, source_index); - } - template ()>* = nullptr> - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - } -}; - -/** - * @brief Specialization function for dictionary type and aggregations. - * - * The `source` column is a dictionary type. This functor de-references the - * dictionary's keys child column and maps the input source index through - * the dictionary's indices child column to pass to the `update_target_element` - * in the above `update_target_from_dictionary` using the type-dispatcher to - * resolve the keys column type. - * - * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )` - * - * @tparam target_has_nulls Indicates presence of null elements in `target` - * @tparam source_has_nulls Indicates presence of null elements in `source`. 
- */ -template -struct update_target_element< - dictionary32, - k, - target_has_nulls, - source_has_nulls, - std::enable_if_t> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - dispatch_type_and_aggregation( - source.child(cudf::dictionary_column_view::keys_column_index).type(), - k, - update_target_from_dictionary{}, - target, - target_index, - source.child(cudf::dictionary_column_view::keys_column_index), - static_cast(source.element(source_index))); - } -}; - -template -constexpr bool is_product_supported() -{ - return is_numeric(); -} - -template -struct update_target_element()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - auto value = static_cast(source.element(source_index)); - cudf::detail::atomic_add(&target.element(target_index), value * value); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - cudf::detail::atomic_mul(&target.element(target_index), - static_cast(source.element(source_index))); - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::COUNT_VALID, - target_has_nulls, - source_has_nulls, - std::enable_if_t()>> { - __device__ void 
operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - cudf::detail::atomic_add(&target.element(target_index), Target{1}); - - // It is assumed the output for COUNT_VALID is initialized to be all valid - } -}; - -template -struct update_target_element< - Source, - aggregation::COUNT_ALL, - target_has_nulls, - source_has_nulls, - std::enable_if_t()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - using Target = target_type_t; - cudf::detail::atomic_add(&target.element(target_index), Target{1}); - - // It is assumed the output for COUNT_ALL is initialized to be all valid - } -}; - -template -struct update_target_element< - Source, - aggregation::ARGMAX, - target_has_nulls, - source_has_nulls, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - auto old = cudf::detail::atomic_cas( - &target.element(target_index), ARGMAX_SENTINEL, source_index); - if (old != ARGMAX_SENTINEL) { - while (source.element(source_index) > source.element(old)) { - old = cudf::detail::atomic_cas(&target.element(target_index), old, source_index); - } - } - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -template -struct update_target_element< - Source, - aggregation::ARGMIN, - target_has_nulls, - source_has_nulls, - std::enable_if_t() and - cudf::is_relationally_comparable()>> { - __device__ void operator()(mutable_column_device_view target, - 
size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - if (source_has_nulls and source.is_null(source_index)) { return; } - - using Target = target_type_t; - auto old = cudf::detail::atomic_cas( - &target.element(target_index), ARGMIN_SENTINEL, source_index); - if (old != ARGMIN_SENTINEL) { - while (source.element(source_index) < source.element(old)) { - old = cudf::detail::atomic_cas(&target.element(target_index), old, source_index); - } - } - - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - } -}; - -/** - * @brief Function object to update a single element in a target column by - * performing an aggregation operation with a single element from a source - * column. - * - * @tparam target_has_nulls Indicates presence of null elements in `target` - * @tparam source_has_nulls Indicates presence of null elements in `source`. - */ -template -struct elementwise_aggregator { - template - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - update_target_element{}( - target, target_index, source, source_index); - } -}; - -/** - * @brief Updates a row in `target` by performing elementwise aggregation - * operations with a row in `source`. - * - * For the row in `target` specified by `target_index`, each element at `i` is - * updated by: - * ```c++ - * target_row[i] = aggs[i](target_row[i], source_row[i]) - * ``` - * - * This function only supports aggregations that can be done in a "single pass", - * i.e., given an initial value `R`, the aggregation `op` can be computed on a series - * of elements `e[i] for i in [0,n)` by computing `R = op(e[i],R)` for any order - * of the values of `i`. 
- * - * The initial value and validity of `R` depends on the aggregation: - * SUM: 0 and NULL - * MIN: Max value of type and NULL - * MAX: Min value of type and NULL - * COUNT_VALID: 0 and VALID - * COUNT_ALL: 0 and VALID - * ARGMAX: `ARGMAX_SENTINEL` and NULL - * ARGMIN: `ARGMIN_SENTINEL` and NULL - * - * It is required that the elements of `target` be initialized with the corresponding - * initial values and validity specified above. - * - * Handling of null elements in both `source` and `target` depends on the aggregation: - * SUM, MIN, MAX, ARGMIN, ARGMAX: - * - `source`: Skipped - * - `target`: Updated from null to valid upon first successful aggregation - * COUNT_VALID, COUNT_ALL: - * - `source`: Skipped - * - `target`: Cannot be null - * - * @param target Table containing the row to update - * @param target_index Index of the row to update in `target` - * @param source Table containing the row used to update the row in `target`. - * The invariant `source.num_columns() >= target.num_columns()` must hold. - * @param source_index Index of the row to use in `source` - * @param aggs Array of aggregations to perform between elements of the `target` - * and `source` rows. Must contain at least `target.num_columns()` valid - * `aggregation::Kind` values. - */ -template -__device__ inline void aggregate_row(mutable_table_device_view target, - size_type target_index, - table_device_view source, - size_type source_index, - aggregation::Kind const* aggs) -{ - for (auto i = 0; i < target.num_columns(); ++i) { - dispatch_type_and_aggregation(source.column(i).type(), - aggs[i], - elementwise_aggregator{}, - target.column(i), - target_index, - source.column(i), - source_index); - } -} - /** * @brief Dispatched functor to initialize a column with the identity of an * aggregation operation. 
diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh new file mode 100644 index 00000000000..10be5e1d36f --- /dev/null +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf::detail { +template +struct update_target_element { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element< + Source, + aggregation::MIN, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !is_fixed_point()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + 
aggregation::MIN, + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + using DeviceTarget = device_storage_type_t; + using DeviceSource = device_storage_type_t; + + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::MAX, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !is_fixed_point()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::MAX, + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + using DeviceTarget = device_storage_type_t; + using DeviceSource = device_storage_type_t; + + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::SUM, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + 
!cudf::is_fixed_point() && !cudf::is_timestamp()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::SUM, + cuda::std::enable_if_t() && + cudf::has_atomic_support>()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + using DeviceTarget = device_storage_type_t; + using DeviceSource = device_storage_type_t; + + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source.element(source_index))); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +/** + * @brief Function object to update a single element in a target column using + * the dictionary key addressed by the specific index. + * + * SFINAE is used to prevent recursion for dictionary type. Dictionary keys cannot be a + * dictionary. + * + */ +struct update_target_from_dictionary { + template ()>* = nullptr> + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + update_target_element{}(target, target_index, source, source_index); + } + template ()>* = nullptr> + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + } +}; + +/** + * @brief Specialization function for dictionary type and aggregations. 
+ * + * The `source` column is a dictionary type. This functor de-references the + * dictionary's keys child column and maps the input source index through + * the dictionary's indices child column to pass to the `update_target_element` + * in the above `update_target_from_dictionary` using the type-dispatcher to + * resolve the keys column type. + * + * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )` + */ +template +struct update_target_element< + dictionary32, + k, + cuda::std::enable_if_t> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + dispatch_type_and_aggregation( + source.child(cudf::dictionary_column_view::keys_column_index).type(), + k, + update_target_from_dictionary{}, + target, + target_index, + source.child(cudf::dictionary_column_view::keys_column_index), + static_cast(source.element(source_index))); + } +}; + +template +struct update_target_element()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + auto value = static_cast(source.element(source_index)); + cudf::detail::atomic_add(&target.element(target_index), value * value); + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + cudf::detail::atomic_mul(&target.element(target_index), + static_cast(source.element(source_index))); + if (target.is_null(target_index)) { 
target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::COUNT_VALID, + cuda::std::enable_if_t()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + cudf::detail::atomic_add(&target.element(target_index), Target{1}); + + // It is assumed the output for COUNT_VALID is initialized to be all valid + } +}; + +template +struct update_target_element< + Source, + aggregation::COUNT_ALL, + cuda::std::enable_if_t()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + using Target = target_type_t; + cudf::detail::atomic_add(&target.element(target_index), Target{1}); + + // It is assumed the output for COUNT_ALL is initialized to be all valid + } +}; + +template +struct update_target_element< + Source, + aggregation::ARGMAX, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), ARGMAX_SENTINEL, source_index); + if (old != ARGMAX_SENTINEL) { + while (source.element(source_index) > source.element(old)) { + old = cudf::detail::atomic_cas(&target.element(target_index), old, source_index); + } + } + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element< + Source, + aggregation::ARGMIN, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(mutable_column_device_view target, + 
size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source.is_null(source_index)) { return; } + + using Target = target_type_t; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), ARGMIN_SENTINEL, source_index); + if (old != ARGMIN_SENTINEL) { + while (source.element(source_index) < source.element(old)) { + old = cudf::detail::atomic_cas(&target.element(target_index), old, source_index); + } + } + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +/** + * @brief Function object to update a single element in a target column by + * performing an aggregation operation with a single element from a source + * column. + */ +struct elementwise_aggregator { + template + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + update_target_element{}(target, target_index, source, source_index); + } +}; + +/** + * @brief Updates a row in `target` by performing elementwise aggregation + * operations with a row in `source`. + * + * For the row in `target` specified by `target_index`, each element at `i` is + * updated by: + * ```c++ + * target_row[i] = aggs[i](target_row[i], source_row[i]) + * ``` + * + * This function only supports aggregations that can be done in a "single pass", + * i.e., given an initial value `R`, the aggregation `op` can be computed on a series + * of elements `e[i] for i in [0,n)` by computing `R = op(e[i],R)` for any order + * of the values of `i`. 
+ * + * The initial value and validity of `R` depend on the aggregation: + * SUM: 0 and NULL + * MIN: Max value of type and NULL + * MAX: Min value of type and NULL + * COUNT_VALID: 0 and VALID + * COUNT_ALL: 0 and VALID + * ARGMAX: `ARGMAX_SENTINEL` and NULL + * ARGMIN: `ARGMIN_SENTINEL` and NULL + * + * It is required that the elements of `target` be initialized with the corresponding + * initial values and validity specified above. + * + * Handling of null elements in both `source` and `target` depends on the aggregation: + * SUM, MIN, MAX, ARGMIN, ARGMAX: + * - `source`: Skipped + * - `target`: Updated from null to valid upon first successful aggregation + * COUNT_VALID, COUNT_ALL: + * - `source`: Skipped by COUNT_VALID; counted (null or not) by COUNT_ALL + * - `target`: Cannot be null + * + * @param target Table containing the row to update + * @param target_index Index of the row to update in `target` + * @param source Table containing the row used to update the row in `target`. + * The invariant `source.num_columns() >= target.num_columns()` must hold. + * @param source_index Index of the row to use in `source` + * @param aggs Array of aggregations to perform between elements of the `target` + * and `source` rows. Must contain at least `target.num_columns()` valid + * `aggregation::Kind` values.
+ */ +__device__ inline void aggregate_row(mutable_table_device_view target, + size_type target_index, + table_device_view source, + size_type source_index, + aggregation::Kind const* aggs) +{ + for (auto i = 0; i < target.num_columns(); ++i) { + dispatch_type_and_aggregation(source.column(i).type(), + aggs[i], + elementwise_aggregator{}, + target.column(i), + target_index, + source.column(i), + source_index); + } +} +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp index 80a4460023f..4295f5e6ddd 100644 --- a/cpp/include/cudf/detail/tdigest/tdigest.hpp +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -143,28 +143,30 @@ std::unique_ptr make_tdigest_column(size_type num_rows, rmm::device_async_resource_ref mr); /** - * @brief Create an empty tdigest column. + * @brief Create a tdigest column of empty tdigests. * - * An empty tdigest column contains a single row of length 0 + * The column created contains the specified number of rows of empty tdigests. * + * @param num_rows The number of rows in the output column. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * - * @returns An empty tdigest column. + * @returns A tdigest column of empty clusters. */ CUDF_EXPORT -std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std::unique_ptr make_empty_tdigests_column(size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** - * @brief Create an empty tdigest scalar. + * @brief Create a scalar of an empty tdigest cluster. * - * An empty tdigest scalar is a struct_scalar that contains a single row of length 0 + * The returned scalar is a struct_scalar that contains a single row of an empty cluster. 
* * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * - * @returns An empty tdigest scalar. + * @returns A scalar of an empty tdigest cluster. */ std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp index 632d5a732ec..4f0c52c5954 100644 --- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -25,33 +26,82 @@ namespace detail { enum class host_memory_kind : uint8_t { PINNED, PAGEABLE }; +void cuda_memcpy_async_impl( + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); + /** - * @brief Asynchronously copies data between the host and device. + * @brief Asynchronously copies data from host to device memory. * * Implementation may use different strategies depending on the size and type of host data. * - * @param dst Destination memory address - * @param src Source memory address - * @param size Number of bytes to copy - * @param kind Type of host memory + * @param dst Destination device memory + * @param src Source host memory * @param stream CUDA stream used for the copy */ -void cuda_memcpy_async( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); +template +void cuda_memcpy_async(device_span dst, host_span src, rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async"); + auto const is_pinned = src.is_device_accessible(); + cuda_memcpy_async_impl(dst.data(), + src.data(), + src.size_bytes(), + is_pinned ? 
host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); +} /** - * @brief Synchronously copies data between the host and device. + * @brief Asynchronously copies data from device to host memory. * * Implementation may use different strategies depending on the size and type of host data. * - * @param dst Destination memory address - * @param src Source memory address - * @param size Number of bytes to copy - * @param kind Type of host memory + * @param dst Destination host memory + * @param src Source device memory * @param stream CUDA stream used for the copy */ -void cuda_memcpy( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); +template +void cuda_memcpy_async(host_span dst, device_span src, rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async"); + auto const is_pinned = dst.is_device_accessible(); + cuda_memcpy_async_impl(dst.data(), + src.data(), + src.size_bytes(), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); +} + +/** + * @brief Synchronously copies data from host to device memory. + * + * Implementation may use different strategies depending on the size and type of host data. + * + * @param dst Destination device memory + * @param src Source host memory + * @param stream CUDA stream used for the copy + */ +template +void cuda_memcpy(device_span dst, host_span src, rmm::cuda_stream_view stream) +{ + cuda_memcpy_async(dst, src, stream); + stream.synchronize(); +} + +/** + * @brief Synchronously copies data from device to host memory. + * + * Implementation may use different strategies depending on the size and type of host data. 
+ * + * @param dst Destination host memory + * @param src Source device memory + * @param stream CUDA stream used for the copy + */ +template +void cuda_memcpy(host_span dst, device_span src, rmm::cuda_stream_view stream) +{ + cuda_memcpy_async(dst, src, stream); + stream.synchronize(); +} } // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 953ae5b9308..1f1e7a2db77 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -101,12 +101,7 @@ rmm::device_uvector make_device_uvector_async(host_span source_data, rmm::device_async_resource_ref mr) { rmm::device_uvector ret(source_data.size(), stream, mr); - auto const is_pinned = source_data.is_device_accessible(); - cuda_memcpy_async(ret.data(), - source_data.data(), - source_data.size() * sizeof(T), - is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + cuda_memcpy_async(ret, source_data, stream); return ret; } @@ -405,13 +400,8 @@ host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view str template host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) { - auto result = make_host_vector(v.size(), stream); - auto const is_pinned = result.get_allocator().is_device_accessible(); - cuda_memcpy_async(result.data(), - v.data(), - v.size() * sizeof(T), - is_pinned ? 
host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + auto result = make_host_vector(v.size(), stream); + cuda_memcpy_async(result, v, stream); return result; } diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 3ebe5cb53e9..f229facca08 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -30,7 +30,7 @@ namespace strings { */ /** - * @brief Returns a boolean column identifying strings entries in which all + * @brief Returns a boolean column identifying string entries where all * characters are of the type specified. * * The output row entry will be set to false if the corresponding string element @@ -105,7 +105,8 @@ std::unique_ptr all_characters_of_type( * `types_to_remove` will be filtered. * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches - * @return New column of boolean results for each string + * @return New strings column with the characters of specified types filtered out and replaced by + * the specified replacement string */ std::unique_ptr filter_characters_of_type( strings_column_view const& input, diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh index 1758790cd64..c259d61060b 100644 --- a/cpp/include/cudf_test/tdigest_utilities.cuh +++ b/cpp/include/cudf_test/tdigest_utilities.cuh @@ -270,8 +270,8 @@ void tdigest_simple_all_nulls_aggregation(Func op) static_cast(values).type(), tdigest_gen{}, op, values, delta); // NOTE: an empty tdigest column still has 1 row. 
- auto expected = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto expected = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); } @@ -562,12 +562,12 @@ template void tdigest_merge_empty(MergeFunc merge_op) { // 3 empty tdigests all in the same group - auto a = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - auto b = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - auto c = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto a = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto b = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto c = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); std::vector cols; cols.push_back(*a); cols.push_back(*b); @@ -577,8 +577,8 @@ void tdigest_merge_empty(MergeFunc merge_op) auto const delta = 1000; auto result = merge_op(*values, delta); - auto expected = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto expected = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result); } diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index 02998b84ffd..d915c85bf85 100644 --- a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu 
@@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,13 @@ */ #include +#include +#include #include +#include + namespace cudf { namespace detail { void initialize_with_identity(mutable_table_view& table, diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/groupby_kernels.cuh index 9abfe22950a..188d0cff3f1 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/groupby_kernels.cuh @@ -18,8 +18,8 @@ #include "multi_pass_kernels.cuh" -#include #include +#include #include #include @@ -100,7 +100,7 @@ struct compute_single_pass_aggs_fn { if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) { auto const result = set.insert_and_find(i); - cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); + cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); } } }; diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index 2358f47bbbb..f9adfc6060e 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 5855f1b5a5f..f7e8134b68d 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -634,11 +634,8 @@ std::pair, hashmap_of_device_columns> build_tree is_mixed_type_column[this_col_id] == 1) column_categories[this_col_id] = NC_STR; } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * 
sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); + cudf::detail::cuda_memcpy_async( + d_column_tree.node_categories, column_categories, stream); } // ignore all children of columns forced as string @@ -653,11 +650,7 @@ std::pair, hashmap_of_device_columns> build_tree forced_as_string_column[this_col_id]) column_categories[this_col_id] = NC_STR; } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories, column_categories, stream); // restore unique_col_ids order std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index bf81162a0ac..76816071d8c 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -620,10 +620,12 @@ struct PdaSymbolToSymbolGroupId { // We map the delimiter character to LINE_BREAK symbol group id, and the newline character // to WHITE_SPACE. Note that delimiter cannot be any of opening(closing) brace, bracket, quote, // escape, comma, colon or whitespace characters. + auto constexpr newline = '\n'; + auto constexpr whitespace = ' '; auto const symbol_position = symbol == delimiter - ? static_cast('\n') - : (symbol == '\n' ? static_cast(' ') : static_cast(symbol)); + ? static_cast(newline) + : (symbol == newline ? 
static_cast(whitespace) : static_cast(symbol)); PdaSymbolGroupIdT symbol_gid = tos_sg_to_pda_sgid[min(symbol_position, pda_sgid_lookup_size - 1)]; return stack_idx * static_cast(symbol_group_id::NUM_PDA_INPUT_SGS) + diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index aed745c42dd..634e6d78ebc 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -125,23 +125,17 @@ class hostdevice_vector { void host_to_device_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + cuda_memcpy_async(d_data, h_data, stream); } - void host_to_device_sync(rmm::cuda_stream_view stream) - { - cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); - } + void host_to_device_sync(rmm::cuda_stream_view stream) { cuda_memcpy(d_data, h_data, stream); } void device_to_host_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + cuda_memcpy_async(h_data, d_data, stream); } - void device_to_host_sync(rmm::cuda_stream_view stream) - { - cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); - } + void device_to_host_sync(rmm::cuda_stream_view stream) { cuda_memcpy(h_data, d_data, stream); } /** * @brief Converts a hostdevice_vector into a hostdevice_span. 
diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 0d017cf1f13..43c3b0a291b 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -292,32 +292,33 @@ std::unique_ptr make_tdigest_column(size_type num_rows, return make_structs_column(num_rows, std::move(children), 0, {}, stream, mr); } -std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr make_empty_tdigests_column(size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { auto offsets = cudf::make_fixed_width_column( - data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr); + data_type(type_id::INT32), num_rows + 1, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), offsets->mutable_view().begin(), offsets->mutable_view().end(), 0); - auto min_col = - cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); + auto min_col = cudf::make_numeric_column( + data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), min_col->mutable_view().begin(), min_col->mutable_view().end(), 0); - auto max_col = - cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); + auto max_col = cudf::make_numeric_column( + data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), max_col->mutable_view().begin(), max_col->mutable_view().end(), 0); - return make_tdigest_column(1, - make_empty_column(type_id::FLOAT64), - make_empty_column(type_id::FLOAT64), + return make_tdigest_column(num_rows, + cudf::make_empty_column(type_id::FLOAT64), + cudf::make_empty_column(type_id::FLOAT64), std::move(offsets), std::move(min_col), std::move(max_col), @@ -338,7 +339,7 @@ std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view 
stream, std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto contents = make_empty_tdigest_column(stream, mr)->release(); + auto contents = make_empty_tdigests_column(1, stream, mr)->release(); return std::make_unique( std::move(*std::make_unique(std::move(contents.children))), true, stream, mr); } diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index e1c1d2e3002..b0a84a6d50c 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -169,19 +169,19 @@ struct nearest_value_scalar_weights { */ template struct nearest_value_centroid_weights { - double const* cumulative_weights; - GroupOffsetsIter outer_offsets; // groups - size_type const* inner_offsets; // tdigests within a group + double const* cumulative_weights; // cumulative weights of non-empty clusters + GroupOffsetsIter group_offsets; // groups + size_type const* tdigest_offsets; // tdigests within a group thrust::pair operator() __device__(double next_limit, size_type group_index) const { - auto const tdigest_begin = outer_offsets[group_index]; - auto const tdigest_end = outer_offsets[group_index + 1]; - auto const num_weights = inner_offsets[tdigest_end] - inner_offsets[tdigest_begin]; + auto const tdigest_begin = group_offsets[group_index]; + auto const tdigest_end = group_offsets[group_index + 1]; + auto const num_weights = tdigest_offsets[tdigest_end] - tdigest_offsets[tdigest_begin]; // NOTE: as it is today, this functor will never be called for any digests that are empty, but // I'll leave this check here for safety. 
if (num_weights == 0) { return thrust::pair{0, 0}; } - double const* group_cumulative_weights = cumulative_weights + inner_offsets[tdigest_begin]; + double const* group_cumulative_weights = cumulative_weights + tdigest_offsets[tdigest_begin]; auto const index = ((thrust::lower_bound(thrust::seq, group_cumulative_weights, @@ -235,21 +235,26 @@ struct cumulative_scalar_weight { */ template struct cumulative_centroid_weight { - double const* cumulative_weights; - GroupLabelsIter group_labels; - GroupOffsetsIter outer_offsets; // groups - cudf::device_span inner_offsets; // tdigests with a group - + double const* cumulative_weights; // cumulative weights of non-empty clusters + GroupLabelsIter group_labels; // group labels for each tdigest including empty ones + GroupOffsetsIter group_offsets; // groups + cudf::device_span tdigest_offsets; // tdigests within a group + + /** + * @brief Returns the cumulative weight for a given value index. The index `n` is the index of + * `n`-th non-empty cluster.
+ */ std::tuple operator() __device__(size_type value_index) const { auto const tdigest_index = static_cast( - thrust::upper_bound(thrust::seq, inner_offsets.begin(), inner_offsets.end(), value_index) - - inner_offsets.begin()) - + thrust::upper_bound( + thrust::seq, tdigest_offsets.begin(), tdigest_offsets.end(), value_index) - + tdigest_offsets.begin()) - 1; auto const group_index = group_labels[tdigest_index]; - auto const first_tdigest_index = outer_offsets[group_index]; - auto const first_weight_index = inner_offsets[first_tdigest_index]; + auto const first_tdigest_index = group_offsets[group_index]; + auto const first_weight_index = tdigest_offsets[first_tdigest_index]; auto const relative_value_index = value_index - first_weight_index; double const* group_cumulative_weights = cumulative_weights + first_weight_index; @@ -284,15 +289,15 @@ struct scalar_group_info { // retrieve group info of centroid inputs by group index template struct centroid_group_info { - double const* cumulative_weights; - GroupOffsetsIter outer_offsets; - size_type const* inner_offsets; + double const* cumulative_weights; // cumulative weights of non-empty clusters + GroupOffsetsIter group_offsets; + size_type const* tdigest_offsets; __device__ thrust::tuple operator()(size_type group_index) const { // if there's no weights in this group of digests at all, return 0. - auto const group_start = inner_offsets[outer_offsets[group_index]]; - auto const group_end = inner_offsets[outer_offsets[group_index + 1]]; + auto const group_start = tdigest_offsets[group_offsets[group_index]]; + auto const group_end = tdigest_offsets[group_offsets[group_index + 1]]; auto const num_weights = group_end - group_start; auto const last_weight_index = group_end - 1; return num_weights == 0 @@ -367,7 +372,6 @@ std::unique_ptr to_tdigest_scalar(std::unique_ptr&& tdigest, * @param group_num_clusters Output. The number of output clusters for each input group. 
* @param group_cluster_offsets Offsets per-group to the start of it's clusters * @param has_nulls Whether or not the input contains nulls - * */ template @@ -661,6 +665,10 @@ std::unique_ptr build_output_column(size_type num_rows, mr); } +/** + * @brief A functor which returns the cluster index within a group that the value at + * the given value index falls into. + */ template struct compute_tdigests_keys_fn { int const delta; @@ -706,8 +714,8 @@ struct compute_tdigests_keys_fn { * boundaries. * * @param delta tdigest compression level - * @param values_begin Beginning of the range of input values. - * @param values_end End of the range of input values. + * @param centroids_begin Beginning of the range of centroids. + * @param centroids_end End of the range of centroids. * @param cumulative_weight Functor which returns cumulative weight and group information for * an absolute input value index. * @param min_col Column containing the minimum value per group. @@ -750,7 +758,9 @@ std::unique_ptr compute_tdigests(int delta, // double // max // } // - if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } + if (total_clusters == 0) { + return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr); + } // each input group represents an individual tdigest. within each tdigest, we want the keys // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall @@ -983,38 +993,54 @@ struct typed_reduce_tdigest { } }; -// utility for merge_tdigests. +/** + * @brief Functor to compute the number of clusters in each group. + * + * Used in `merge_tdigests`. 
+ */ template -struct group_num_weights_func { - GroupOffsetsIter outer_offsets; - size_type const* inner_offsets; +struct group_num_clusters_func { + GroupOffsetsIter group_offsets; + size_type const* tdigest_offsets; __device__ size_type operator()(size_type group_index) { - auto const tdigest_begin = outer_offsets[group_index]; - auto const tdigest_end = outer_offsets[group_index + 1]; - return inner_offsets[tdigest_end] - inner_offsets[tdigest_begin]; + auto const tdigest_begin = group_offsets[group_index]; + auto const tdigest_end = group_offsets[group_index + 1]; + return tdigest_offsets[tdigest_end] - tdigest_offsets[tdigest_begin]; } }; -// utility for merge_tdigests. +/** + * @brief Functor to determine if a group is empty. + * + * Used in `merge_tdigests`. + */ struct group_is_empty { __device__ bool operator()(size_type group_size) { return group_size == 0; } }; -// utility for merge_tdigests. +/** + * @brief Functor that returns the grouping key for each tdigest cluster. + * + * Used in `merge_tdigests`. + */ template struct group_key_func { GroupLabelsIter group_labels; - size_type const* inner_offsets; - size_type num_inner_offsets; + size_type const* tdigest_offsets; + size_type num_tdigest_offsets; + /** + * @brief Returns the group index for an absolute cluster index. The index `n` is the index of the + * `n`-th non-empty cluster.
+ */ __device__ size_type operator()(size_type index) { // what -original- tdigest index this absolute index corresponds to - auto const iter = thrust::prev( - thrust::upper_bound(thrust::seq, inner_offsets, inner_offsets + num_inner_offsets, index)); - auto const tdigest_index = thrust::distance(inner_offsets, iter); + auto const iter = thrust::prev(thrust::upper_bound( + thrust::seq, tdigest_offsets, tdigest_offsets + num_tdigest_offsets, index)); + auto const tdigest_index = thrust::distance(tdigest_offsets, iter); // what group index the original tdigest belongs to return group_labels[tdigest_index]; @@ -1040,8 +1066,8 @@ std::pair, rmm::device_uvector> generate_mer // each group represents a collection of tdigest columns. each row is 1 tdigest. // within each group, we want to sort all the centroids within all the tdigests - // in that group, using the means as the key. the "outer offsets" represent the indices of the - // tdigests, and the "inner offsets" represents the list of centroids for a particular tdigest. + // in that group, using the means as the key. the "group offsets" represent the indices of the + // tdigests, and the "tdigest offsets" represents the list of centroids for a particular tdigest. 
// // rows // ---- centroid 0 --------- @@ -1054,12 +1080,12 @@ std::pair, rmm::device_uvector> generate_mer // tdigest 3 centroid 7 // centroid 8 // ---- centroid 9 -------- - auto inner_offsets = tdv.centroids().offsets(); + auto tdigest_offsets = tdv.centroids().offsets(); auto centroid_offsets = cudf::detail::make_counting_transform_iterator( 0, cuda::proclaim_return_type( - [group_offsets, inner_offsets = tdv.centroids().offsets().begin()] __device__( - size_type i) { return inner_offsets[group_offsets[i]]; })); + [group_offsets, tdigest_offsets = tdv.centroids().offsets().begin()] __device__( + size_type i) { return tdigest_offsets[group_offsets[i]]; })); // perform the sort using the means as the key size_t temp_size; @@ -1091,9 +1117,34 @@ std::pair, rmm::device_uvector> generate_mer return {std::move(output_means), std::move(output_weights)}; } +/** + * @brief Perform a merge aggregation of tdigests. This function usually takes the input as the + * outputs of multiple `typed_group_tdigest` calls, and merges them. + * + * A tdigest can be empty in the input, which means that there was no valid input data to generate + * it. These empty tdigests will have no centroids (means or weights) and will have a `min` and + * `max` of 0. + * + * @param tdv input tdigests. The tdigests within this column are grouped by key. + * @param h_group_offsets a host iterator of the offsets to the start of each group. A group is + * counted as one even when the cluster is empty in it. The offsets should have the same values as + * the ones in `group_offsets`. + * @param group_offsets a device iterator of the offsets to the start of each group. A group is + * counted as one even when the cluster is empty in it. The offsets should have the same values as + * the ones in `h_group_offsets`. + * @param group_labels a device iterator of the group label for each tdigest cluster including + * empty clusters. + * @param num_group_labels the number of unique group labels.
+ * @param num_groups the number of groups. + * @param max_centroids the maximum number of centroids (clusters) in the output (merged) tdigest. + * @param stream CUDA stream + * @param mr device memory resource + * + * @return A column containing the merged tdigests. + */ template std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, - HGroupOffsetIter h_outer_offsets, + HGroupOffsetIter h_group_offsets, GroupOffsetIter group_offsets, GroupLabelIter group_labels, size_t num_group_labels, @@ -1133,22 +1184,24 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, thrust::equal_to{}, // key equality check thrust::maximum{}); + auto tdigest_offsets = tdv.centroids().offsets(); + // for any empty groups, set the min and max to be 0. not technically necessary but it makes // testing simpler. - auto group_num_weights = cudf::detail::make_counting_transform_iterator( + auto group_num_clusters = cudf::detail::make_counting_transform_iterator( 0, - group_num_weights_func{group_offsets, - tdv.centroids().offsets().begin()}); + group_num_clusters_func{group_offsets, + tdigest_offsets.begin()}); thrust::replace_if(rmm::exec_policy(stream), merged_min_col->mutable_view().begin(), merged_min_col->mutable_view().end(), - group_num_weights, + group_num_clusters, group_is_empty{}, 0); thrust::replace_if(rmm::exec_policy(stream), merged_max_col->mutable_view().begin(), merged_max_col->mutable_view().end(), - group_num_weights, + group_num_clusters, group_is_empty{}, 0); @@ -1166,14 +1219,13 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, // generate group keys for all centroids in the entire column rmm::device_uvector group_keys(num_centroids, stream, temp_mr); - auto iter = thrust::make_counting_iterator(0); - auto inner_offsets = tdv.centroids().offsets(); + auto iter = thrust::make_counting_iterator(0); thrust::transform(rmm::exec_policy(stream), iter, iter + num_centroids, group_keys.begin(), group_key_func{ - group_labels, 
inner_offsets.begin(), inner_offsets.size()}); + group_labels, tdigest_offsets.begin(), tdigest_offsets.size()}); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), group_keys.begin(), group_keys.begin() + num_centroids, @@ -1182,20 +1234,24 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, auto const delta = max_centroids; + // TDigest merge takes the output of typed_group_tdigest as its input, which must not have + // any nulls. + auto const has_nulls = false; + // generate cluster info auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info( delta, num_groups, nearest_value_centroid_weights{ - cumulative_weights.begin(), group_offsets, inner_offsets.begin()}, + cumulative_weights.begin(), group_offsets, tdigest_offsets.begin()}, centroid_group_info{ - cumulative_weights.begin(), group_offsets, inner_offsets.begin()}, + cumulative_weights.begin(), group_offsets, tdigest_offsets.begin()}, cumulative_centroid_weight{ cumulative_weights.begin(), group_labels, group_offsets, - {inner_offsets.begin(), static_cast(inner_offsets.size())}}, - false, + {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, + has_nulls, stream, mr); @@ -1212,13 +1268,13 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, cumulative_weights.begin(), group_labels, group_offsets, - {inner_offsets.begin(), static_cast(inner_offsets.size())}}, + {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, std::move(merged_min_col), std::move(merged_max_col), group_cluster_wl, std::move(group_cluster_offsets), total_clusters, - false, + has_nulls, stream, mr); } @@ -1283,7 +1339,7 @@ std::unique_ptr group_tdigest(column_view const& col, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } + if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr); } auto const delta 
= max_centroids; return cudf::type_dispatcher(col.type(), @@ -1309,7 +1365,15 @@ std::unique_ptr group_merge_tdigest(column_view const& input, tdigest_column_view tdv(input); if (num_groups == 0 || input.size() == 0) { - return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); + return cudf::tdigest::detail::make_empty_tdigests_column(1, stream, mr); + } + + if (tdv.means().size() == 0) { + // `group_merge_tdigest` takes the output of `typed_group_tdigest` as its input, which wipes + // out the means and weights for empty clusters. Thus, no mean here indicates that all clusters + // are empty in the input. Let's skip all complex computation in the below, but just return + // an empty tdigest per group. + return cudf::tdigest::detail::make_empty_tdigests_column(num_groups, stream, mr); } // bring group offsets back to the host diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 0efb881eb3e..c0af27a1748 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -30,7 +30,7 @@ namespace cudf::detail { namespace { // Simple kernel to copy between device buffers -CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n) +CUDF_KERNEL void copy_kernel(char const* __restrict__ src, char* __restrict__ dst, size_t n) { auto const idx = cudf::detail::grid_1d::global_thread_id(); if (idx < n) { dst[idx] = src[idx]; } @@ -61,7 +61,7 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea }; // namespace -void cuda_memcpy_async( +void cuda_memcpy_async_impl( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { if (kind == host_memory_kind::PINNED) { @@ -73,11 +73,4 @@ void cuda_memcpy_async( } } -void cuda_memcpy( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) -{ - cuda_memcpy_async(dst, src, size, kind, stream); - stream.synchronize(); -} - } // namespace cudf::detail diff 
--git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index baa59026b07..4ae5d06b214 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -469,16 +469,16 @@ TEST_F(TDigestMergeTest, EmptyGroups) cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0}; int const delta = 1000; - auto a = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto a = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); auto b = cudf::type_dispatcher( static_cast(values_b).type(), tdigest_gen_grouped{}, keys, values_b, delta); - auto c = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto c = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); auto d = cudf::type_dispatcher( static_cast(values_d).type(), tdigest_gen_grouped{}, keys, values_d, delta); - auto e = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto e = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); std::vector cols; cols.push_back(*a); @@ -507,3 +507,126 @@ TEST_F(TDigestMergeTest, EmptyGroups) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]); } + +std::unique_ptr do_agg( + cudf::column_view key, + cudf::column_view val, + std::function()> make_agg) +{ + std::vector keys; + keys.push_back(key); + cudf::table_view const key_table(keys); + + cudf::groupby::groupby gb(key_table); + std::vector requests; + cudf::groupby::aggregation_request req; + req.values = val; + req.aggregations.push_back(make_agg()); + requests.push_back(std::move(req)); + + auto result = 
gb.aggregate(std::move(requests)); + + std::vector> result_columns; + for (auto&& c : result.first->release()) { + result_columns.push_back(std::move(c)); + } + + EXPECT_EQ(result.second.size(), 1); + EXPECT_EQ(result.second[0].results.size(), 1); + result_columns.push_back(std::move(result.second[0].results[0])); + + return std::make_unique(std::move(result_columns)); +} + +TEST_F(TDigestMergeTest, AllValuesAreNull) +{ + // The input must be sorted by the key. + // See `aggregate_result_functor::operator()` for details. + auto const keys = cudf::test::fixed_width_column_wrapper{{0, 0, 1, 1, 2}}; + auto const keys_view = cudf::column_view(keys); + auto val_elems = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + auto val_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { + // All values are null + return false; + }); + auto const vals = cudf::test::fixed_width_column_wrapper{ + val_elems, val_elems + keys_view.size(), val_valids}; + + auto const delta = 1000; + + // Compute tdigest. The result should have 3 empty clusters, one per group. + auto const compute_result = do_agg(keys_view, cudf::column_view(vals), [&delta]() { + return cudf::make_tdigest_aggregation(delta); + }); + + auto const expected_computed_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2}}; + cudf::column_view const expected_computed_keys_view{expected_computed_keys}; + auto const expected_computed_vals = + cudf::tdigest::detail::make_empty_tdigests_column(expected_computed_keys_view.size(), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_keys_view, compute_result->get_column(0).view()); + // The computed values are nullable even though the input values are not. + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_vals->view(), + compute_result->get_column(1).view()); + + // Merge tdigest. The result should have 3 empty clusters, one per group. 
+ auto const merge_result = + do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() { + return cudf::make_merge_tdigest_aggregation(delta); + }); + + auto const expected_merged_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2}}; + cudf::column_view const expected_merged_keys_view{expected_merged_keys}; + auto const expected_merged_vals = + cudf::tdigest::detail::make_empty_tdigests_column(expected_merged_keys_view.size(), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_keys_view, merge_result->get_column(0).view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_vals->view(), merge_result->get_column(1).view()); +} + +TEST_F(TDigestMergeTest, AllValuesInOneGroupIsNull) +{ + cudf::test::fixed_width_column_wrapper keys{0, 1, 2, 2, 3}; + cudf::test::fixed_width_column_wrapper vals{{10.0, 20.0, {}, {}, 30.0}, + {true, true, false, false, true}}; + + auto const delta = 1000; + + // Compute tdigest. The result should have 4 tdigests, one per group; only the one for key 2 is empty because all of its values are null.
+ auto const compute_result = do_agg(cudf::column_view(keys), cudf::column_view(vals), [&delta]() { + return cudf::make_tdigest_aggregation(delta); + }); + + auto const expected_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2, 3}}; + + cudf::test::fixed_width_column_wrapper expected_means{10, 20, 30}; + cudf::test::fixed_width_column_wrapper expected_weights{1, 1, 1}; + cudf::test::fixed_width_column_wrapper expected_offsets{0, 1, 2, 2, 3}; + cudf::test::fixed_width_column_wrapper expected_mins{10.0, 20.0, 0.0, 30.0}; + cudf::test::fixed_width_column_wrapper expected_maxes{10.0, 20.0, 0.0, 30.0}; + auto const expected_values = + cudf::tdigest::detail::make_tdigest_column(4, + std::make_unique(expected_means), + std::make_unique(expected_weights), + std::make_unique(expected_offsets), + std::make_unique(expected_mins), + std::make_unique(expected_maxes), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::column_view{expected_keys}, + compute_result->get_column(0).view()); + // The computed values are nullable even though the input values are not. + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_values->view(), compute_result->get_column(1).view()); + + // Merge tdigest. The result should match the computed tdigests: 4 groups, with an empty tdigest for key 2.
+ auto const merge_result = + do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() { + return cudf::make_merge_tdigest_aggregation(delta); + }); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(cudf::column_view{expected_keys}, + merge_result->get_column(0).view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_values->view(), merge_result->get_column(1).view()); +} diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp index 915717713df..37414eb3fba 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cpp +++ b/cpp/tests/quantiles/percentile_approx_test.cpp @@ -371,8 +371,8 @@ struct PercentileApproxTest : public cudf::test::BaseFixture {}; TEST_F(PercentileApproxTest, EmptyInput) { - auto empty_ = cudf::tdigest::detail::make_empty_tdigest_column( - cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto empty_ = cudf::tdigest::detail::make_empty_tdigests_column( + 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; std::vector input; diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst new file mode 100644 index 00000000000..8e86b33b1a0 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst @@ -0,0 +1,6 @@ +============= +find_multiple +============= + +.. 
automodule:: pylibcudf.strings.find_multiple + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 9b1a6b72a88..e73ea3370ec 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -9,10 +9,12 @@ strings contains extract find + find_multiple findall regex_flags regex_program repeat replace slice + split strip diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst new file mode 100644 index 00000000000..cba96e86f45 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst @@ -0,0 +1,6 @@ +===== +split +===== + +.. automodule:: pylibcudf.strings.split + :members: diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx index 376a6f8af97..a57ce29eb45 100644 --- a/python/cudf/cudf/_lib/strings/char_types.pyx +++ b/python/cudf/cudf/_lib/strings/char_types.pyx @@ -1,23 +1,12 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.char_types cimport ( - all_characters_of_type as cpp_all_characters_of_type, - filter_characters_of_type as cpp_filter_characters_of_type, - string_character_types, -) - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf.strings import char_types @acquire_spill_lock() @@ -25,26 +14,15 @@ def filter_alphanum(Column source_strings, object py_repl, bool keep=True): """ Returns a Column of strings keeping only alphanumeric character types. """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_repl = ( - repl.get_raw_ptr() + plc_column = char_types.filter_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.ALL_TYPES if keep + else char_types.StringCharacterTypes.ALPHANUM, + py_repl.device_value.c_value, + char_types.StringCharacterTypes.ALPHANUM if keep + else char_types.StringCharacterTypes.ALL_TYPES ) - - with nogil: - c_result = move(cpp_filter_characters_of_type( - source_view, - string_character_types.ALL_TYPES if keep - else string_character_types.ALPHANUM, - scalar_repl[0], - string_character_types.ALPHANUM if keep - else string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -54,17 +32,12 @@ def is_decimal(Column source_strings): that contain only decimal characters -- those that can be used to extract base10 numbers. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.DECIMAL, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.DECIMAL, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -75,17 +48,12 @@ def is_alnum(Column source_strings): Equivalent to: is_alpha() or is_digit() or is_numeric() or is_decimal() """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.ALPHANUM, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.ALPHANUM, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -94,17 +62,12 @@ def is_alpha(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only alphabetic characters. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.ALPHA, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.ALPHA, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -113,17 +76,12 @@ def is_digit(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only decimal and digit characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.DIGIT, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.DIGIT, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -133,17 +91,12 @@ def is_numeric(Column source_strings): that contain only numeric characters. These include digit and numeric characters. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.NUMERIC, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.NUMERIC, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -152,17 +105,12 @@ def is_upper(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only upper-case characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.UPPER, - string_character_types.CASE_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.UPPER, + char_types.StringCharacterTypes.CASE_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -171,17 +119,12 @@ def is_lower(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only lower-case characters. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.LOWER, - string_character_types.CASE_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.LOWER, + char_types.StringCharacterTypes.CASE_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -190,14 +133,9 @@ def is_space(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contains all characters which are spaces only. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.SPACE, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.SPACE, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx index 1358f8e3c2c..39e0013769f 100644 --- a/python/cudf/cudf/_lib/strings/find_multiple.pyx +++ b/python/cudf/cudf/_lib/strings/find_multiple.pyx @@ -1,18 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.find_multiple cimport ( - find_multiple as cpp_find_multiple, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def find_multiple(Column source_strings, Column target_strings): @@ -20,14 +13,8 @@ def find_multiple(Column source_strings, Column target_strings): Returns a column with character position values where each of the `target_strings` are found in each string of `source_strings`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view target_view = target_strings.view() - - with nogil: - c_result = move(cpp_find_multiple( - source_view, - target_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.find_multiple.find_multiple( + source_strings.to_pylibcudf(mode="read"), + target_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx index a81fb18e752..5319addc41c 100644 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ b/python/cudf/cudf/_lib/strings/split/partition.pyx @@ -1,21 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.split.partition cimport ( - partition as cpp_partition, - rpartition as cpp_rpartition, -) -from pylibcudf.libcudf.table.table cimport table - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -25,25 +14,11 @@ def partition(Column source_strings, Returns data by splitting the `source_strings` column at the first occurrence of the specified `py_delimiter`. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_partition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.partition.partition( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -53,22 +28,8 @@ def rpartition(Column source_strings, Returns a Column by splitting the `source_strings` column at the last occurrence of the specified `py_delimiter`. 
""" - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rpartition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.partition.rpartition( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx index f481fea4c51..4ec6c7073d8 100644 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ b/python/cudf/cudf/_lib/strings/split/split.pyx @@ -1,33 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.strings.split.split cimport ( - rsplit as cpp_rsplit, - rsplit_re as cpp_rsplit_re, - rsplit_record as cpp_rsplit_record, - rsplit_record_re as cpp_rsplit_record_re, - split as cpp_split, - split_re as cpp_split_re, - split_record as cpp_split_record, - split_record_re as cpp_split_record_re, -) -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport 
data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -39,26 +18,12 @@ def split(Column source_strings, column around the specified `py_delimiter`. The split happens from beginning. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.split( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -70,25 +35,12 @@ def split_record(Column source_strings, column around the specified `py_delimiter`. The split happens from beginning. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.split_record( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -100,26 +52,12 @@ def rsplit(Column source_strings, column around the specified `py_delimiter`. The split happens from the end. 
""" - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.rsplit( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -131,25 +69,12 @@ def rsplit_record(Column source_strings, column around the specified `py_delimiter`. The split happens from the end. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.rsplit_record( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -160,24 +85,15 @@ def split_re(Column source_strings, Returns data by splitting the `source_strings` column around the delimiters identified by `pattern`. 
""" - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.split_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -189,24 +105,15 @@ def rsplit_re(Column source_strings, column around the delimiters identified by `pattern`. The delimiters are searched starting from the end of each string. 
""" - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.rsplit_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -217,23 +124,15 @@ def split_record_re(Column source_strings, Returns a Column by splitting the `source_strings` column around the delimiters identified by `pattern`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.split_record_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -245,20 +144,12 @@ def rsplit_record_re(Column source_strings, column around the delimiters identified by `pattern`. The delimiters are searched starting from the end of each string. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.rsplit_record_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/translate.pyx b/python/cudf/cudf/_lib/strings/translate.pyx index 3fad91bbfc0..3ef478532c2 100644 --- a/python/cudf/cudf/_lib/strings/translate.pyx +++ b/python/cudf/cudf/_lib/strings/translate.pyx @@ -1,25 +1,12 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair -from libcpp.utility cimport move -from libcpp.vector cimport vector from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.translate cimport ( - filter_characters as cpp_filter_characters, - filter_type, - translate as cpp_translate, -) -from pylibcudf.libcudf.types cimport char_utf8 - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +import pylibcudf as plc @acquire_spill_lock() @@ -29,30 +16,11 @@ def translate(Column source_strings, Translates individual characters within each string if present in the mapping_table. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef int table_size - table_size = len(mapping_table) - - cdef vector[pair[char_utf8, char_utf8]] c_mapping_table - c_mapping_table.reserve(table_size) - - for key in mapping_table: - value = mapping_table[key] - if type(value) is int: - value = chr(value) - if type(value) is str: - value = int.from_bytes(value.encode(), byteorder='big') - if type(key) is int: - key = chr(key) - if type(key) is str: - key = int.from_bytes(key.encode(), byteorder='big') - c_mapping_table.push_back((key, value)) - - with nogil: - c_result = move(cpp_translate(source_view, c_mapping_table)) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.translate.translate( + source_strings.to_pylibcudf(mode="read"), + mapping_table, + ) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -64,44 +32,11 @@ def filter_characters(Column source_strings, Removes or keeps individual characters within each string using the provided mapping_table. 
""" - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_repl = ( - repl.get_raw_ptr() + plc_result = plc.strings.translate.filter_characters( + source_strings.to_pylibcudf(mode="read"), + mapping_table, + plc.strings.translate.FilterType.KEEP + if keep else plc.strings.translate.FilterType.REMOVE, + py_repl.device_value.c_value ) - cdef int table_size - table_size = len(mapping_table) - - cdef vector[pair[char_utf8, char_utf8]] c_mapping_table - c_mapping_table.reserve(table_size) - - for key in mapping_table: - value = mapping_table[key] - if type(value) is int: - value = chr(value) - if type(value) is str: - value = int.from_bytes(value.encode(), byteorder='big') - if type(key) is int: - key = chr(key) - if type(key) is str: - key = int.from_bytes(key.encode(), byteorder='big') - c_mapping_table.push_back((key, value)) - - cdef filter_type c_keep - if keep is True: - c_keep = filter_type.KEEP - else: - c_keep = filter_type.REMOVE - - with nogil: - c_result = move(cpp_filter_characters( - source_view, - c_mapping_table, - c_keep, - scalar_repl[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 4463e3280df..da422db5eae 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2546,9 +2546,9 @@ def split( result_table = {0: self._column.copy()} else: if regex is True: - data, _ = libstrings.split_re(self._column, pat, n) + data = libstrings.split_re(self._column, pat, n) else: - data, _ = libstrings.split( + data = libstrings.split( self._column, cudf.Scalar(pat, "str"), n ) if len(data) == 1 and data[0].null_count == len(self._column): @@ -2719,9 +2719,9 @@ def rsplit( result_table = {0: self._column.copy()} else: if regex is True: - data, _ = 
libstrings.rsplit_re(self._column, pat, n) + data = libstrings.rsplit_re(self._column, pat, n) else: - data, _ = libstrings.rsplit( + data = libstrings.rsplit( self._column, cudf.Scalar(pat, "str"), n ) if len(data) == 1 and data[0].null_count == len(self._column): @@ -2820,7 +2820,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.partition(self._column, cudf.Scalar(sep, "str"))[0], + libstrings.partition(self._column, cudf.Scalar(sep, "str")), expand=expand, ) @@ -2885,7 +2885,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.rpartition(self._column, cudf.Scalar(sep, "str"))[0], + libstrings.rpartition(self._column, cudf.Scalar(sep, "str")), expand=expand, ) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt index abf4357f862..b8b4343173e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx) +set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx translate.pyx) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd index 5d54c1c3593..76afe047e8c 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd @@ -22,9 +22,6 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \ CASE_TYPES ALL_TYPES -cdef extern from "cudf/strings/char_types/char_types.hpp" \ - namespace "cudf::strings" nogil: - cdef unique_ptr[column] all_characters_of_type( column_view source_strings, string_character_types types, diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd index 0491644a10a..3d048c1f50b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd @@ -9,5 +9,5 @@ cdef extern from "cudf/strings/find_multiple.hpp" namespace "cudf::strings" \ nogil: cdef unique_ptr[column] find_multiple( - column_view source_strings, + column_view input, column_view targets) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd index 4162e886a7d..4299cf62e99 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd @@ -12,9 +12,9 @@ cdef extern from "cudf/strings/split/partition.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] partition( - column_view source_strings, + column_view input, string_scalar delimiter) except + cdef unique_ptr[table] rpartition( - column_view source_strings, + 
column_view input, string_scalar delimiter) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd index 3046149aebb..a22a79fc7d7 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd @@ -14,22 +14,22 @@ cdef extern from "cudf/strings/split/split.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] split( - column_view source_strings, + column_view strings_column, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[table] rsplit( - column_view source_strings, + column_view strings_column, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[column] split_record( - column_view source_strings, + column_view strings, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[column] rsplit_record( - column_view source_strings, + column_view strings, string_scalar delimiter, size_type maxsplit) except + @@ -38,21 +38,21 @@ cdef extern from "cudf/strings/split/split_re.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] split_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[table] rsplit_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[column] split_record_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[column] rsplit_record_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd index 85fa719128a..9fd24f2987b 100644 --- 
a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd @@ -13,15 +13,15 @@ from pylibcudf.libcudf.types cimport char_utf8 cdef extern from "cudf/strings/translate.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] translate( - column_view source_strings, + column_view input, vector[pair[char_utf8, char_utf8]] chars_table) except + - ctypedef enum filter_type: - KEEP 'cudf::strings::filter_type::KEEP', - REMOVE 'cudf::strings::filter_type::REMOVE' + cpdef enum class filter_type(bool): + KEEP + REMOVE cdef unique_ptr[column] filter_characters( - column_view source_strings, - vector[pair[char_utf8, char_utf8]] chars_table, - filter_type keep, + column_view input, + vector[pair[char_utf8, char_utf8]] characters_to_filter, + filter_type keep_characters, string_scalar replacement) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 142bc124ca2..d92f806efbe 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -20,6 +20,7 @@ set(cython_sources contains.pyx extract.pyx find.pyx + find_multiple.pyx findall.pyx regex_flags.pyx regex_program.pyx @@ -28,6 +29,7 @@ set(cython_sources side_type.pyx slice.pyx strip.pyx + translate.pyx ) set(linked_libraries cudf::cudf) @@ -38,3 +40,4 @@ rapids_cython_create_modules( ) add_subdirectory(convert) +add_subdirectory(split) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index d8afccc7336..788e2c99ab1 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -9,12 +9,15 @@ from . 
cimport ( convert, extract, find, + find_multiple, findall, regex_flags, regex_program, replace, slice, + split, strip, + translate, ) from .side_type cimport side_type @@ -33,5 +36,7 @@ __all__ = [ "replace", "slice", "strip", + "split", "side_type", + "translate", ] diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index 22452812e42..bcaeb073d0b 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -9,13 +9,16 @@ convert, extract, find, + find_multiple, findall, regex_flags, regex_program, repeat, replace, slice, + split, strip, + translate, ) from .side_type import SideType @@ -34,5 +37,7 @@ "replace", "slice", "strip", + "split", "SideType", + "translate", ] diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/strings/char_types.pxd index ad4e4cf61d8..f9f7d244212 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pxd +++ b/python/pylibcudf/pylibcudf/strings/char_types.pxd @@ -1,3 +1,19 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.column cimport Column from pylibcudf.libcudf.strings.char_types cimport string_character_types +from pylibcudf.scalar cimport Scalar + + +cpdef Column all_characters_of_type( + Column source_strings, + string_character_types types, + string_character_types verify_types +) + +cpdef Column filter_characters_of_type( + Column source_strings, + string_character_types types_to_remove, + Scalar replacement, + string_character_types types_to_keep +) diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx index e7621fb4d84..6a24d79bc4b 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pyx +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx @@ -1,4 +1,93 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings cimport char_types as cpp_char_types +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference from pylibcudf.libcudf.strings.char_types import \ string_character_types as StringCharacterTypes # no-cython-lint + + +cpdef Column all_characters_of_type( + Column source_strings, + string_character_types types, + string_character_types verify_types +): + """ + Identifies strings where all characters match the specified type. + + Parameters + ---------- + source_strings : Column + Strings instance for this operation + types : StringCharacterTypes + The character types to check in each string + verify_types : StringCharacterTypes + Only verify against these character types. + + Returns + ------- + Column + New column of boolean results for each string + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_char_types.all_characters_of_type( + source_strings.view(), + types, + verify_types, + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column filter_characters_of_type( + Column source_strings, + string_character_types types_to_remove, + Scalar replacement, + string_character_types types_to_keep +): + """ + Filter specific character types from a column of strings. + + Parameters + ---------- + source_strings : Column + Strings instance for this operation + types_to_remove : StringCharacterTypes + The character types to check in each string. + replacement : Scalar + The replacement character to use when removing characters + types_to_keep : StringCharacterTypes + Default `ALL_TYPES` means all characters of `types_to_remove` + will be filtered. 
+ + Returns + ------- + Column + New column with the specified characters filtered out and + replaced with the specified replacement string. + """ + cdef const string_scalar* c_replacement = ( + replacement.c_obj.get() + ) + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_char_types.filter_characters_of_type( + source_strings.view(), + types_to_remove, + dereference(c_replacement), + types_to_keep, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd new file mode 100644 index 00000000000..b7b3aefa336 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column find_multiple(Column input, Column targets) diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx new file mode 100644 index 00000000000..413fc1cb79d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx @@ -0,0 +1,39 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple + + +cpdef Column find_multiple(Column input, Column targets): + """ + Returns a lists column with character position values where each + of the target strings are found in each string. + + For details, see :cpp:func:`cudf::strings::find_multiple`. 
+ + Parameters + ---------- + input : Column + Strings instance for this operation + targets : Column + Strings to search for in each string + + Returns + ------- + Column + Lists column with character position values + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_find_multiple.find_multiple( + input.view(), + targets.view() + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt new file mode 100644 index 00000000000..8f544f6f537 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources partition.pyx split.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.pxd b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd new file mode 100644 index 00000000000..72086e57d9f --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . cimport partition, split diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.py b/python/pylibcudf/pylibcudf/strings/split/__init__.py new file mode 100644 index 00000000000..2033e5e275b --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . import partition, split diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/strings/split/partition.pxd new file mode 100644 index 00000000000..c18257a4787 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + + +cpdef Table partition(Column input, Scalar delimiter=*) + +cpdef Table rpartition(Column input, Scalar delimiter=*) diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx new file mode 100644 index 00000000000..ecc959e65b0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx @@ -0,0 +1,95 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from pylibcudf.libcudf.strings.split cimport partition as cpp_partition
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.table cimport Table
+
+from cython.operator import dereference
+
+
+cpdef Table partition(Column input, Scalar delimiter=None):
+    """
+    Returns a set of 3 columns by splitting each string using the
+    specified delimiter.
+
+    For details, see :cpp:func:`cudf::strings::partition`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+
+    delimiter : Scalar, default None
+        UTF-8 string to split on; None is replaced by an empty string.
+
+    Returns
+    -------
+    Table
+        New table of strings columns
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter
+
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+    # Read the pointer only now: `delimiter` may have been None until here.
+    c_delimiter = <const string_scalar*>delimiter.c_obj.get()
+
+    with nogil:
+        c_result = move(
+            cpp_partition.partition(
+                input.view(),
+                dereference(c_delimiter)
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+cpdef Table rpartition(Column input, Scalar delimiter=None):
+    """
+    Returns a set of 3 columns by splitting each string using the
+    specified delimiter starting from the end of each string.
+
+    For details, see :cpp:func:`cudf::strings::rpartition`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+
+    delimiter : Scalar, default None
+        UTF-8 string to split on; None is replaced by an empty string.
+
+    Returns
+    -------
+    Table
+        New strings columns
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter
+
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+    # Read the pointer only now: `delimiter` may have been None until here.
+    c_delimiter = <const string_scalar*>delimiter.c_obj.get()
+
+    with nogil:
+        c_result = move(
+            cpp_partition.rpartition(
+                input.view(),
+                dereference(c_delimiter)
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/strings/split/split.pxd
new file mode 100644
index 00000000000..355a1874298
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/split.pxd
@@ -0,0 +1,24 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+
+cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit)
+
+cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit)
+
+cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit)
+
+cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit)
+
+cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit)
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx
new file mode 100644
index 00000000000..a7d7f39fc47
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx
@@ -0,0 +1,326 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.split cimport split as cpp_split +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.table cimport Table + +from cython.operator import dereference + + +cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit): + """ + Returns a list of columns by splitting each string using the + specified delimiter. + + For details, see :cpp:func:`cudf::strings::split`. + + Parameters + ---------- + strings_column : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating the split points in each string. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + New table of strings columns + """ + cdef unique_ptr[table] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_split.split( + strings_column.view(), + dereference(c_delimiter), + maxsplit, + ) + ) + + return Table.from_libcudf(move(c_result)) + + +cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit): + """ + Returns a list of columns by splitting each string using the + specified delimiter starting from the end of each string. + + For details, see :cpp:func:`cudf::strings::rsplit`. + + Parameters + ---------- + strings_column : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating the split points in each string. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. 
+ + Returns + ------- + Table + New table of strings columns. + """ + cdef unique_ptr[table] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_split.rsplit( + strings_column.view(), + dereference(c_delimiter), + maxsplit, + ) + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit): + """ + Splits individual strings elements into a list of strings. + + For details, see :cpp:func:`cudf::strings::split_record`. + + Parameters + ---------- + strings : Column + A column of string elements to be split. + + delimiter : Scalar + The string to identify split points in each string. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_split.split_record( + strings.view(), + dereference(c_delimiter), + maxsplit, + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit): + """ + Splits individual strings elements into a list of strings starting + from the end of each string. + + For details, see :cpp:func:`cudf::strings::rsplit_record`. + + Parameters + ---------- + strings : Column + A column of string elements to be split. + + delimiter : Scalar + The string to identify split points in each string. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. 
+ """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_split.rsplit_record( + strings.view(), + dereference(c_delimiter), + maxsplit, + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a table of strings columns + using a regex_program's pattern to delimit each string. + + For details, see :cpp:func:`cudf::strings::split_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + A table of columns of strings. + """ + cdef unique_ptr[table] c_result + + with nogil: + c_result = move( + cpp_split.split_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a table of strings columns + using a regex_program's pattern to delimit each string starting from + the end of the string. + + For details, see :cpp:func:`cudf::strings::rsplit_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + A table of columns of strings. 
+ """ + cdef unique_ptr[table] c_result + + with nogil: + c_result = move( + cpp_split.rsplit_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a list column of strings using the given + regex_program to delimit each string. + + For details, see :cpp:func:`cudf::strings::split_record_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_split.split_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a list column of strings using the given + regex_program to delimit each string starting from the end of the string. + + For details, see :cpp:func:`cudf::strings::rsplit_record_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. 
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_split.rsplit_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/translate.pxd b/python/pylibcudf/pylibcudf/strings/translate.pxd new file mode 100644 index 00000000000..0ca746801d7 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/translate.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.column cimport Column +from pylibcudf.libcudf.strings.translate cimport filter_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column translate(Column input, dict chars_table) + +cpdef Column filter_characters( + Column input, + dict characters_to_filter, + filter_type keep_characters, + Scalar replacement +) diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx new file mode 100644 index 00000000000..a62c7ec4528 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/translate.pyx @@ -0,0 +1,122 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.pair cimport pair +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings cimport translate as cpp_translate +from pylibcudf.libcudf.types cimport char_utf8 +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference +from pylibcudf.libcudf.strings.translate import \ + filter_type as FilterType # no-cython-lint + + +cdef vector[pair[char_utf8, char_utf8]] _table_to_c_table(dict table): + """ + Convert str.maketrans table to cudf compatible table. 
+ """ + cdef int table_size = len(table) + cdef vector[pair[char_utf8, char_utf8]] c_table + + c_table.reserve(table_size) + for key, value in table.items(): + if isinstance(value, int): + value = chr(value) + if isinstance(value, str): + value = int.from_bytes(value.encode(), byteorder='big') + if isinstance(key, int): + key = chr(key) + if isinstance(key, str): + key = int.from_bytes(key.encode(), byteorder='big') + c_table.push_back((key, value)) + + return c_table + + +cpdef Column translate(Column input, dict chars_table): + """ + Translates individual characters within each string. + + For details, see :cpp:func:`cudf::strings::translate`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + chars_table : dict + Table of UTF-8 character mappings + + Returns + ------- + Column + New column with padded strings. + """ + cdef unique_ptr[column] c_result + cdef vector[pair[char_utf8, char_utf8]] c_chars_table = _table_to_c_table( + chars_table + ) + + with nogil: + c_result = move( + cpp_translate.translate( + input.view(), + c_chars_table + ) + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column filter_characters( + Column input, + dict characters_to_filter, + filter_type keep_characters, + Scalar replacement +): + """ + Removes ranges of characters from each string in a strings column. + + For details, see :cpp:func:`cudf::strings::filter_characters`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + characters_to_filter : dict + Table of character ranges to filter on + + keep_characters : FilterType + If true, the `characters_to_filter` are retained + and all other characters are removed. + + replacement : Scalar + Replacement string for each character removed. + + Returns + ------- + Column + New column with filtered strings. 
+ """ + cdef unique_ptr[column] c_result + cdef vector[pair[char_utf8, char_utf8]] c_characters_to_filter = _table_to_c_table( + characters_to_filter + ) + cdef const string_scalar* c_replacement = ( + replacement.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_translate.filter_characters( + input.view(), + c_characters_to_filter, + keep_characters, + dereference(c_replacement), + ) + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py new file mode 100644 index 00000000000..bcd030c019e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +from utils import assert_column_eq + + +def test_all_characters_of_type(): + pa_array = pa.array(["1", "A"]) + result = plc.strings.char_types.all_characters_of_type( + plc.interop.from_arrow(pa_array), + plc.strings.char_types.StringCharacterTypes.ALPHA, + plc.strings.char_types.StringCharacterTypes.ALL_TYPES, + ) + expected = pc.utf8_is_alpha(pa_array) + assert_column_eq(result, expected) + + +def test_filter_characters_of_type(): + pa_array = pa.array(["=A="]) + result = plc.strings.char_types.filter_characters_of_type( + plc.interop.from_arrow(pa_array), + plc.strings.char_types.StringCharacterTypes.ALPHANUM, + plc.interop.from_arrow(pa.scalar(" ")), + plc.strings.char_types.StringCharacterTypes.ALL_TYPES, + ) + expected = pc.replace_substring(pa_array, "A", " ") + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py new file mode 100644 index 00000000000..d6b37a388f0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py @@ -0,0 +1,22 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_find_multiple(): + arr = pa.array(["abc", "def"]) + targets = pa.array(["a", "c", "e"]) + result = plc.strings.find_multiple.find_multiple( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(targets), + ) + expected = pa.array( + [ + [elem.find(target) for target in targets.to_pylist()] + for elem in arr.to_pylist() + ], + type=pa.list_(pa.int32()), + ) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py new file mode 100644 index 00000000000..80cae8d1c6b --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_table_eq + + +@pytest.fixture +def data_col(): + pa_arr = pa.array(["ab_cd", "def_g_h", None]) + plc_column = plc.interop.from_arrow(pa_arr) + return pa_arr, plc_column + + +def test_partition(data_col): + pa_arr, plc_column = data_col + result = plc.strings.split.partition.partition( + plc_column, plc.interop.from_arrow(pa.scalar("_")) + ) + expected = pa.table( + { + "a": ["ab", "def", None], + "b": ["_", "_", None], + "c": ["cd", "g_h", None], + } + ) + assert_table_eq(expected, result) + + +def test_rpartition(data_col): + pa_arr, plc_column = data_col + result = plc.strings.split.partition.rpartition( + plc_column, plc.interop.from_arrow(pa.scalar("_")) + ) + expected = pa.table( + { + "a": ["ab", "def_g", None], + "b": ["_", "_", None], + "c": ["cd", "h", None], + } + ) + assert_table_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py new file mode 100644 index 00000000000..2aeffac8209 --- /dev/null +++ 
b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq, assert_table_eq
+
+
+@pytest.fixture
+def data_col():
+    pa_array = pa.array(["a_b_c", "d-e-f", None])
+    plc_column = plc.interop.from_arrow(pa_array)
+    return pa_array, plc_column
+
+
+@pytest.fixture
+def delimiter():
+    delimiter = "_"
+    plc_delimiter = plc.interop.from_arrow(pa.scalar(delimiter))
+    return delimiter, plc_delimiter
+
+
+@pytest.fixture
+def re_delimiter():
+    return "[_-]"
+
+
+def test_split(data_col, delimiter):
+    _, plc_column = data_col
+    _, plc_delimiter = delimiter
+    result = plc.strings.split.split.split(plc_column, plc_delimiter, 1)
+    expected = pa.table(
+        {
+            "a": ["a", "d-e-f", None],
+            "b": ["b_c", None, None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_rsplit(data_col, delimiter):
+    _, plc_column = data_col
+    _, plc_delimiter = delimiter
+    result = plc.strings.split.split.rsplit(plc_column, plc_delimiter, 1)
+    expected = pa.table(
+        {
+            "a": ["a_b", "d-e-f", None],
+            "b": ["c", None, None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_split_record(data_col, delimiter):
+    pa_array, plc_column = data_col
+    delim, plc_delim = delimiter
+    result = plc.strings.split.split.split_record(plc_column, plc_delim, 1)
+    expected = pc.split_pattern(pa_array, delim, max_splits=1)
+    assert_column_eq(expected, result)
+
+
+def test_rsplit_record(data_col, delimiter):
+    pa_array, plc_column = data_col
+    delim, plc_delim = delimiter
+    result = plc.strings.split.split.rsplit_record(plc_column, plc_delim, 1)
+    expected = pc.split_pattern(pa_array, delim, max_splits=1, reverse=True)
+    assert_column_eq(expected, result)
+
+
+def test_split_re(data_col, re_delimiter):
+    _, plc_column = data_col
+    result = plc.strings.split.split.split_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pa.table(
+        {
+            "a": ["a", "d", None],
+            "b": ["b_c", "e-f", None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_rsplit_re(data_col, re_delimiter):
+    _, plc_column = data_col
+    result = plc.strings.split.split.rsplit_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pa.table(
+        {
+            "a": ["a_b", "d-e", None],
+            "b": ["c", "f", None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_split_record_re(data_col, re_delimiter):
+    pa_array, plc_column = data_col
+    result = plc.strings.split.split.split_record_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pc.split_pattern_regex(pa_array, re_delimiter, max_splits=1)
+    assert_column_eq(expected, result)
+
+
+def test_rsplit_record_re(data_col, re_delimiter):
+    pa_array, plc_column = data_col
+    result = plc.strings.split.split.rsplit_record_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        -1,
+    )
+    expected = pc.split_pattern_regex(pa_array, re_delimiter)
+    assert_column_eq(expected, result)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_translate.py b/python/pylibcudf/pylibcudf/tests/test_string_translate.py
new file mode 100644
index 00000000000..2ae893e69fb
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_translate.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
import pyarrow as pa
import pylibcudf as plc
import pytest
from utils import assert_column_eq


@pytest.fixture
def data_col():
    """Return the sample strings as ``(pyarrow array, pylibcudf column)``."""
    strings = pa.array(
        ["aa", "bbb", "cccc", "abcd", None],
        type=pa.string(),
    )
    return strings, plc.interop.from_arrow(strings)


@pytest.fixture
def trans_table():
    """Return the ``str.maketrans`` table built from "abd" and "A Q"."""
    return str.maketrans("abd", "A Q")


def test_translate(data_col, trans_table):
    """``translate`` matches ``str.translate``; non-string rows map to None."""
    pa_array, plc_col = data_col
    got = plc.strings.translate.translate(plc_col, trans_table)

    translated = []
    for item in pa_array.to_pylist():
        if isinstance(item, str):
            translated.append(item.translate(trans_table))
        else:
            translated.append(None)
    assert_column_eq(pa.array(translated), got)


@pytest.mark.parametrize(
    "keep",
    [
        plc.strings.translate.FilterType.KEEP,
        plc.strings.translate.FilterType.REMOVE,
    ],
)
def test_filter_characters(data_col, trans_table, keep):
    """Compare ``filter_characters`` with a pure-Python reference.

    The reference keeps a character or replaces it with "*", depending on
    ``keep`` and on whether the character is among the keys and values of
    ``trans_table``. Non-string rows are passed through unchanged.
    """
    pa_array, plc_col = data_col
    filter_type = plc.strings.translate.FilterType
    got = plc.strings.translate.filter_characters(
        plc_col, trans_table, keep, plc.interop.from_arrow(pa.scalar("*"))
    )

    # NOTE(review): str.maketrans produces integer code points, while the
    # characters tested below are one-character strings, so this membership
    # test looks like it can never match -- confirm the intended reference
    # semantics before relying on this expectation.
    listed = set(trans_table) | set(trans_table.values())

    def survives(ch):
        # Listed characters survive under KEEP; unlisted ones under REMOVE.
        if ch in listed:
            return keep == filter_type.KEEP
        return keep == filter_type.REMOVE

    exp_data = []
    for item in pa_array.to_pylist():
        if isinstance(item, str):
            filtered = "".join(ch if survives(ch) else "*" for ch in item)
            exp_data.append(filtered)
        else:
            exp_data.append(item)
    expected = pa.array(exp_data)
    assert_column_eq(expected, got)