diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c37d05a21c7..900e9eed98e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -530,7 +530,7 @@ add_library( src/stream_compaction/apply_boolean_mask.cu src/stream_compaction/distinct.cu src/stream_compaction/distinct_count.cu - src/stream_compaction/distinct_reduce.cu + src/stream_compaction/distinct_helpers.cu src/stream_compaction/drop_nans.cu src/stream_compaction/drop_nulls.cu src/stream_compaction/stable_distinct.cu diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh new file mode 100644 index 00000000000..2d2b43f1d4a --- /dev/null +++ b/cpp/include/cudf/detail/hash_reduce_by_row.cuh @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include + +namespace cudf::detail { + +using hash_map_type = + cuco::static_map; + +/** + * @brief The base struct for customized reduction functor to perform reduce-by-key with keys are + * rows that compared equal. + * + * TODO: We need to switch to use `static_reduction_map` when it is ready + * (https://github.com/NVIDIA/cuCollections/pull/98). + */ +template +struct reduce_by_row_fn_base { + protected: + MapView const d_map; + KeyHasher const d_hasher; + KeyEqual const d_equal; + OutputType* const d_output; + + reduce_by_row_fn_base(MapView const& d_map, + KeyHasher const& d_hasher, + KeyEqual const& d_equal, + OutputType* const d_output) + : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, d_output{d_output} + { + } + + /** + * @brief Return a pointer to the output array at the given index. + * + * @param idx The access index + * @return A pointer to the given index in the output array + */ + __device__ OutputType* get_output_ptr(size_type const idx) const + { + auto const iter = d_map.find(idx, d_hasher, d_equal); + + if (iter != d_map.end()) { + // Only one (undetermined) index value of the duplicate rows could be inserted into the map. + // As such, looking up for all indices of duplicate rows always returns the same value. + auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed); + + // All duplicate rows will have concurrent access to this same output slot. + return &d_output[inserted_idx]; + } else { + // All input `idx` values have been inserted into the map before. + // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if + // `d_equal(idx, idx) == false`. + // Such situations are due to comparing nulls or NaNs which are considered as always unequal. + // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct + // output slot. + return &d_output[idx]; + } + } +}; + +/** + * @brief Perform a reduction on groups of rows that are compared equal. + * + * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared + * equal. A hash table is used to find groups of equal rows. + * + * At the beginning of the operation, the entire output array is filled with a value given by + * the `init` parameter. Then, the reduction result for each row group is written into the output + * array at the index of an unspecified row in the group. + * + * @tparam ReduceFuncBuilder The builder class that must have a `build()` method returning a + * reduction functor derived from `reduce_by_row_fn_base` + * @tparam OutputType Type of the reduction results + * @param map The auxiliary map to perform reduction + * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row + * comparisons + * @param num_rows The number of all input rows + * @param has_nulls Indicate whether the input rows has any nulls at any nested levels + * @param has_nested_columns Indicates whether the input table has any nested columns + * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN values in floating point column should be + * considered equal. + * @param init The initial value for reduction of each row group + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned vector + * @return A device_uvector containing the reduction results + */ +template +rmm::device_uvector hash_reduce_by_row( + hash_map_type const& map, + std::shared_ptr const preprocessed_input, + size_type num_rows, + cudf::nullate::DYNAMIC has_nulls, + bool has_nested_columns, + null_equality nulls_equal, + nan_equality nans_equal, + ReduceFuncBuilder func_builder, + OutputType init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const map_dview = map.get_device_view(); + auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); + auto const key_hasher = row_hasher.device_hasher(has_nulls); + auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); + + auto reduction_results = rmm::device_uvector(num_rows, stream, mr); + thrust::uninitialized_fill( + rmm::exec_policy(stream), reduction_results.begin(), reduction_results.end(), init); + + auto const reduce_by_row = [&](auto const value_comp) { + if (has_nested_columns) { + auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin())); + } else { + auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin())); + } + }; + + if (nans_equal == nan_equality::ALL_EQUAL) { + using nan_equal_comparator = + cudf::experimental::row::equality::nan_equal_physical_equality_comparator; + reduce_by_row(nan_equal_comparator{}); + } else { + using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; + reduce_by_row(nan_unequal_comparator{}); + } + + return reduction_results; +} + +} // namespace cudf::detail diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index cc60b2a12ea..cc1e3423d42 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "distinct_reduce.cuh" +#include "distinct_helpers.hpp" #include #include @@ -50,8 +50,8 @@ rmm::device_uvector get_distinct_indices(table_view const& input, } auto map = hash_map_type{compute_hash_table_size(input.num_rows()), - cuco::empty_key{COMPACTION_EMPTY_KEY_SENTINEL}, - cuco::empty_value{COMPACTION_EMPTY_VALUE_SENTINEL}, + cuco::empty_key{-1}, + cuco::empty_value{std::numeric_limits::min()}, detail::hash_table_allocator_type{default_allocator{}, stream}, stream.value()}; @@ -61,7 +61,7 @@ rmm::device_uvector get_distinct_indices(table_view const& input, auto const has_nested_columns = cudf::detail::has_nested_columns(input); auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); - auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls)); + auto const key_hasher = row_hasher.device_hasher(has_nulls); auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); @@ -96,16 +96,16 @@ rmm::device_uvector get_distinct_indices(table_view const& input, } // For other keep options, reduce by row on rows that compare equal. - auto const reduction_results = hash_reduce_by_row(map, - std::move(preprocessed_input), - input.num_rows(), - has_nulls, - has_nested_columns, - keep, - nulls_equal, - nans_equal, - stream, - rmm::mr::get_current_device_resource()); + auto const reduction_results = reduce_by_row(map, + std::move(preprocessed_input), + input.num_rows(), + has_nulls, + has_nested_columns, + keep, + nulls_equal, + nans_equal, + stream, + rmm::mr::get_current_device_resource()); // Extract the desired output indices from reduction results. auto const map_end = [&] { diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 4bca0827efe..ac4811ad279 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -136,14 +136,14 @@ cudf::size_type distinct_count(table_view const& keys, auto const preprocessed_input = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); - auto const hash_key = experimental::compaction_hash(row_hasher.device_hasher(has_nulls)); + auto const hash_key = row_hasher.device_hasher(has_nulls); auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); auto const comparator_helper = [&](auto const row_equal) { using hasher_type = decltype(hash_key); auto key_set = cuco::experimental::static_set{ cuco::experimental::extent{compute_hash_table_size(num_rows)}, - cuco::empty_key{COMPACTION_EMPTY_KEY_SENTINEL}, + cuco::empty_key{-1}, row_equal, cuco::experimental::linear_probing<1, hasher_type>{hash_key}, detail::hash_table_allocator_type{default_allocator{}, stream}, diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu new file mode 100644 index 00000000000..8f36ec98f4a --- /dev/null +++ b/cpp/src/stream_compaction/distinct_helpers.cu @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "distinct_helpers.hpp" + +#include + +namespace cudf::detail { + +namespace { +/** + * @brief The functor to find the first/last/all duplicate row for rows that compared equal. + */ +template +struct reduce_fn : reduce_by_row_fn_base { + duplicate_keep_option const keep; + + reduce_fn(MapView const& d_map, + KeyHasher const& d_hasher, + KeyEqual const& d_equal, + duplicate_keep_option const keep, + size_type* const d_output) + : reduce_by_row_fn_base{d_map, + d_hasher, + d_equal, + d_output}, + keep{keep} + { + } + + __device__ void operator()(size_type const idx) const + { + auto const out_ptr = this->get_output_ptr(idx); + + if (keep == duplicate_keep_option::KEEP_FIRST) { + // Store the smallest index of all rows that are equal. + atomicMin(out_ptr, idx); + } else if (keep == duplicate_keep_option::KEEP_LAST) { + // Store the greatest index of all rows that are equal. + atomicMax(out_ptr, idx); + } else { + // Count the number of rows in each group of rows that are compared equal. + atomicAdd(out_ptr, size_type{1}); + } + } +}; + +/** + * @brief The builder to construct an instance of `reduce_fn` functor base on the given + * value of the `duplicate_keep_option` member variable. + */ +struct reduce_func_builder { + duplicate_keep_option const keep; + + template + auto build(MapView const& d_map, + KeyHasher const& d_hasher, + KeyEqual const& d_equal, + size_type* const d_output) + { + return reduce_fn{d_map, d_hasher, d_equal, keep, d_output}; + } +}; + +} // namespace + +// This function is split from `distinct.cu` to improve compile time. +rmm::device_uvector reduce_by_row( + hash_map_type const& map, + std::shared_ptr const preprocessed_input, + size_type num_rows, + cudf::nullate::DYNAMIC has_nulls, + bool has_nested_columns, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY, + "This function should not be called with KEEP_ANY"); + + return hash_reduce_by_row(map, + preprocessed_input, + num_rows, + has_nulls, + has_nested_columns, + nulls_equal, + nans_equal, + reduce_func_builder{keep}, + reduction_init_value(keep), + stream, + mr); +} + +} // namespace cudf::detail diff --git a/cpp/src/stream_compaction/distinct_reduce.cuh b/cpp/src/stream_compaction/distinct_helpers.hpp similarity index 92% rename from cpp/src/stream_compaction/distinct_reduce.cuh rename to cpp/src/stream_compaction/distinct_helpers.hpp index 8ec1fa18205..b667d0b04f0 100644 --- a/cpp/src/stream_compaction/distinct_reduce.cuh +++ b/cpp/src/stream_compaction/distinct_helpers.hpp @@ -14,18 +14,14 @@ * limitations under the License. */ -#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" -#include #include #include #include #include #include -#include - -#include namespace cudf::detail { @@ -56,6 +52,8 @@ auto constexpr reduction_init_value(duplicate_keep_option keep) * - If `keep == KEEP_LAST`: max of row indices in the group. * - If `keep == KEEP_NONE`: count of equivalent rows (group size). * + * Note that this function is not needed when `keep == KEEP_NONE`. + * * At the beginning of the operation, the entire output array is filled with a value given by * the `reduction_init_value()` function. Then, the reduction result for each row group is written * into the output array at the index of an unspecified row in the group. @@ -68,11 +66,13 @@ auto constexpr reduction_init_value(duplicate_keep_option keep) * @param has_nested_columns Indicates whether the input table has any nested columns * @param keep The parameter to determine what type of reduction to perform * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN values in floating point column should be + * considered equal. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned vector * @return A device_uvector containing the reduction results */ -rmm::device_uvector hash_reduce_by_row( +rmm::device_uvector reduce_by_row( hash_map_type const& map, std::shared_ptr const preprocessed_input, size_type num_rows, diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_reduce.cu deleted file mode 100644 index 020e6a495bc..00000000000 --- a/cpp/src/stream_compaction/distinct_reduce.cu +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "distinct_reduce.cuh" - -#include -#include -#include - -namespace cudf::detail { - -namespace { -/** - * @brief A functor to perform reduce-by-key with keys are rows that compared equal. - * - * TODO: We need to switch to use `static_reduction_map` when it is ready - * (https://github.com/NVIDIA/cuCollections/pull/98). - */ -template -struct reduce_by_row_fn { - MapView const d_map; - KeyHasher const d_hasher; - KeyEqual const d_equal; - duplicate_keep_option const keep; - size_type* const d_output; - - reduce_by_row_fn(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - duplicate_keep_option const keep, - size_type* const d_output) - : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, keep{keep}, d_output{d_output} - { - } - - __device__ void operator()(size_type const idx) const - { - auto const out_ptr = get_output_ptr(idx); - - if (keep == duplicate_keep_option::KEEP_FIRST) { - // Store the smallest index of all rows that are equal. - atomicMin(out_ptr, idx); - } else if (keep == duplicate_keep_option::KEEP_LAST) { - // Store the greatest index of all rows that are equal. - atomicMax(out_ptr, idx); - } else { - // Count the number of rows in each group of rows that are compared equal. - atomicAdd(out_ptr, size_type{1}); - } - } - - private: - __device__ size_type* get_output_ptr(size_type const idx) const - { - auto const iter = d_map.find(idx, d_hasher, d_equal); - - if (iter != d_map.end()) { - // Only one index value of the duplicate rows could be inserted into the map. - // As such, looking up for all indices of duplicate rows always returns the same value. - auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed); - - // All duplicate rows will have concurrent access to this same output slot. - return &d_output[inserted_idx]; - } else { - // All input `idx` values have been inserted into the map before. - // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if - // `d_equal(idx, idx) == false`. - // Such situations are due to comparing nulls or NaNs which are considered as always unequal. - // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct - // output slot. - return &d_output[idx]; - } - } -}; - -} // namespace - -rmm::device_uvector hash_reduce_by_row( - hash_map_type const& map, - std::shared_ptr const preprocessed_input, - size_type num_rows, - cudf::nullate::DYNAMIC has_nulls, - bool has_nested_columns, - duplicate_keep_option keep, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY, - "This function should not be called with KEEP_ANY"); - - auto reduction_results = rmm::device_uvector(num_rows, stream, mr); - - thrust::uninitialized_fill(rmm::exec_policy(stream), - reduction_results.begin(), - reduction_results.end(), - reduction_init_value(keep)); - - auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); - auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls)); - - auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); - - auto const reduce_by_row = [&](auto const value_comp) { - if (has_nested_columns) { - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - reduce_by_row_fn{ - map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()}); - } else { - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - reduce_by_row_fn{ - map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()}); - } - }; - - if (nans_equal == nan_equality::ALL_EQUAL) { - using nan_equal_comparator = - cudf::experimental::row::equality::nan_equal_physical_equality_comparator; - reduce_by_row(nan_equal_comparator{}); - } else { - using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; - reduce_by_row(nan_unequal_comparator{}); - } - - return reduction_results; -} - -} // namespace cudf::detail diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh index 4779cd990fd..839672d6a56 100644 --- a/cpp/src/stream_compaction/stream_compaction_common.cuh +++ b/cpp/src/stream_compaction/stream_compaction_common.cuh @@ -29,28 +29,6 @@ namespace cudf { namespace detail { -namespace experimental { - -/** - * @brief Device callable to hash a given row. - */ -template -class compaction_hash { - public: - compaction_hash(RowHash row_hasher) : _hash{row_hasher} {} - - __device__ inline auto operator()(size_type i) const noexcept - { - auto hash = _hash(i); - return (hash == COMPACTION_EMPTY_KEY_SENTINEL) ? (hash - 1) : hash; - } - - private: - RowHash _hash; -}; - -} // namespace experimental - /**  * @brief Device functor to determine if a row is valid.  */ diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp index 0cd2d8f4b14..58d958d2ff4 100644 --- a/cpp/src/stream_compaction/stream_compaction_common.hpp +++ b/cpp/src/stream_compaction/stream_compaction_common.hpp @@ -30,11 +30,6 @@ namespace cudf { namespace detail { -constexpr auto COMPACTION_EMPTY_KEY_SENTINEL = std::numeric_limits::max(); -constexpr auto COMPACTION_EMPTY_VALUE_SENTINEL = std::numeric_limits::min(); - -using hash_type = cuco::murmurhash3_32; - using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; using hash_map_type = diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 2f6e864b51c..c0bd9ec6eee 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -19,6 +19,7 @@ drop_nulls, ) from cudf._lib.types import size_type_dtype +from cudf.api.extensions import no_default from cudf.api.types import ( is_bool_dtype, is_integer, @@ -701,21 +702,65 @@ def fillna(self, value, downcast=None): return super().fillna(value=value) - def to_frame(self, index=True, name=None): + def to_frame(self, index=True, name=no_default): """Create a DataFrame with a column containing this Index Parameters ---------- index : boolean, default True Set the index of the returned DataFrame as the original Index - name : str, default None - Name to be used for the column + name : object, defaults to index.name + The passed name should substitute for the index name (if it has + one). + Returns ------- DataFrame - cudf DataFrame - """ - if name is not None: + DataFrame containing the original Index data. + + See Also + -------- + Index.to_series : Convert an Index to a Series. + Series.to_frame : Convert Series to DataFrame. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx.to_frame() + animal + animal + Ant Ant + Bear Bear + Cow Cow + + By default, the original Index is reused. To enforce a new Index: + + >>> idx.to_frame(index=False) + animal + 0 Ant + 1 Bear + 2 Cow + + To override the name of the resulting column, specify `name`: + + >>> idx.to_frame(index=False, name='zoo') + zoo + 0 Ant + 1 Bear + 2 Cow + """ + if name is None: + warnings.warn( + "Explicitly passing `name=None` currently preserves " + "the Index's name or uses a default name of 0. This " + "behaviour is deprecated, and in the future `None` " + "will be used as the name of the " + "resulting DataFrame column.", + FutureWarning, + ) + name = no_default + if name is not no_default: col_name = name elif self.name is None: col_name = 0 diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 59ab3569814..d2e2f11a12e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2519,11 +2519,11 @@ def _construct_array( arbitrary = cupy.asarray(arbitrary, dtype=dtype) except (TypeError, ValueError): native_dtype = dtype - inferred_dtype = None + inferred_dtype = infer_dtype(arbitrary, skipna=False) if ( dtype is None and not cudf._lib.scalar._is_null_host_scalar(arbitrary) - and (inferred_dtype := infer_dtype(arbitrary, skipna=False)) + and inferred_dtype in ( "mixed", "mixed-integer", @@ -2533,6 +2533,20 @@ def _construct_array( if inferred_dtype == "interval": # Only way to construct an Interval column. return pd.array(arbitrary) + elif ( + inferred_dtype == "string" and getattr(dtype, "kind", None) == "M" + ): + # We may have date-like strings with timezones + try: + pd_arbitrary = pd.to_datetime(arbitrary) + if isinstance(pd_arbitrary.dtype, pd.DatetimeTZDtype): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) + except pd.errors.OutOfBoundsDatetime: + # https://github.com/pandas-dev/pandas/issues/55096 + pass + arbitrary = np.asarray( arbitrary, dtype=native_dtype diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index da6c4fb858c..7775723e267 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -631,6 +631,10 @@ def infer_format(element: str, **kwargs) -> str: fmt = _guess_datetime_format(element, **kwargs) if fmt is not None: + if "%z" in fmt or "%Z" in fmt: + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) return fmt element_parts = element.split(".") @@ -651,11 +655,12 @@ def infer_format(element: str, **kwargs) -> str: raise ValueError("Unable to infer the timestamp format from the data") if len(second_parts) > 1: - # "Z" indicates Zulu time(widely used in aviation) - Which is - # UTC timezone that currently cudf only supports. Having any other - # unsupported timezone will let the code fail below - # with a ValueError. - second_parts.remove("Z") + # We may have a non-digit, timezone-like component + # like Z, UTC-3, +01:00 + if any(re.search(r"\D", part) for part in second_parts): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) second_part = "".join(second_parts[1:]) if len(second_part) > 1: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index bc6726879c1..21380bb841c 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -20,6 +20,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries +from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column from cudf.core._compat import PANDAS_GE_150 @@ -1015,7 +1016,12 @@ def __getitem__(self, index): elif isinstance(index, slice): start, stop, step = index.indices(len(self)) index = column.arange(start, stop, step) - result = MultiIndex.from_frame(self.to_frame(index=False).take(index)) + result = MultiIndex.from_frame( + self.to_frame(index=False, name=range(0, self.nlevels)).take( + index + ), + names=self.names, + ) # we are indexing into a single row of the MultiIndex, # return that row as a tuple: @@ -1026,24 +1032,95 @@ def __getitem__(self, index): result._codes = self._codes.take(index) if self._levels is not None: result._levels = self._levels - result.names = self.names return result @_cudf_nvtx_annotate - def to_frame(self, index=True, name=None): + def to_frame(self, index=True, name=no_default, allow_duplicates=False): + """ + Create a DataFrame with the levels of the MultiIndex as columns. + + Column ordering is determined by the DataFrame constructor with data as + a dict. + + Parameters + ---------- + index : bool, default True + Set the index of the returned DataFrame as the original MultiIndex. + name : list / sequence of str, optional + The passed names should substitute index level names. + allow_duplicates : bool, optional default False + Allow duplicate column labels to be created. Note + that this parameter is non-functional because + duplicates column labels aren't supported in cudf. + + Returns + ------- + DataFrame + + Examples + -------- + >>> import cudf + >>> mi = cudf.MultiIndex.from_tuples([('a', 'c'), ('b', 'd')]) + >>> mi + MultiIndex([('a', 'c'), + ('b', 'd')], + ) + + >>> df = mi.to_frame() + >>> df + 0 1 + a c a c + b d b d + + >>> df = mi.to_frame(index=False) + >>> df + 0 1 + 0 a c + 1 b d + + >>> df = mi.to_frame(name=['x', 'y']) + >>> df + x y + a c a c + b d b d + """ # TODO: Currently this function makes a shallow copy, which is # incorrect. We want to make a deep copy, otherwise further # modifications of the resulting DataFrame will affect the MultiIndex. - df = cudf.DataFrame._from_data(data=self._data) - if index: - df = df.set_index(self) - if name is not None: + if name is None: + warnings.warn( + "Explicitly passing `name=None` currently preserves the " + "Index's name or uses a default name of 0. This behaviour " + "is deprecated, and in the future `None` will be used " + "as the name of the resulting DataFrame column.", + FutureWarning, + ) + name = no_default + + if name is not no_default: if len(name) != len(self.levels): raise ValueError( "'name' should have the same length as " "number of levels on index." ) - df.columns = name + column_names = name + else: + column_names = self.names + all_none_names = None + if not ( + all_none_names := all(x is None for x in column_names) + ) and len(column_names) != len(set(column_names)): + raise ValueError("Duplicate column names are not allowed") + df = cudf.DataFrame._from_data( + data=self._data, + columns=column_names + if name is not no_default and not all_none_names + else None, + ) + + if index: + df = df.set_index(self) + return df @_cudf_nvtx_annotate @@ -1504,7 +1581,9 @@ def droplevel(self, level=-1): @_cudf_nvtx_annotate def to_pandas(self, nullable=False, **kwargs): - result = self.to_frame(index=False).to_pandas(nullable=nullable) + result = self.to_frame( + index=False, name=list(range(self.nlevels)) + ).to_pandas(nullable=nullable) return pd.MultiIndex.from_frame(result, names=self.names) @classmethod @@ -1623,7 +1702,7 @@ def _clean_nulls_from_index(self): Convert all na values(if any) in MultiIndex object to `` as a preprocessing step to `__repr__` methods. """ - index_df = self.to_frame(index=False) + index_df = self.to_frame(index=False, name=list(range(self.nlevels))) return MultiIndex.from_frame( index_df._clean_nulls_from_dataframe(index_df), names=self.names ) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 7c019f0722c..6a56ab8f3a5 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -49,9 +49,11 @@ def _reduce( if level is not None: raise NotImplementedError("level parameter is not implemented yet") - if numeric_only: + if numeric_only and not isinstance( + self._column, cudf.core.column.numerical_base.NumericalBaseColumn + ): raise NotImplementedError( - f"Series.{op} does not implement numeric_only" + f"Series.{op} does not implement numeric_only." ) try: return getattr(self._column, op)(**kwargs) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index f736e055163..a3f4bacf206 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -353,15 +353,16 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): format=format, ) else: - if infer_datetime_format and format is None: + if format is None: + if not infer_datetime_format and dayfirst: + raise NotImplementedError( + f"{dayfirst=} not implemented " + f"when {format=} and {infer_datetime_format=}." + ) format = column.datetime.infer_format( element=col.element_indexing(0), dayfirst=dayfirst, ) - elif format is None: - format = column.datetime.infer_format( - element=col.element_indexing(0) - ) return col.as_datetime_column( dtype=_unit_dtype_map[unit], format=format, diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4c20258ae67..164856ed6f5 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -617,22 +617,44 @@ def test_datetime_dataframe(): @pytest.mark.parametrize("infer_datetime_format", [True, False]) def test_cudf_to_datetime(data, dayfirst, infer_datetime_format): pd_data = data + is_string_data = False if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): gd_data = cudf.from_pandas(pd_data) + is_string_data = ( + gd_data.ndim == 1 + and not gd_data.empty + and gd_data.dtype.kind == "O" + ) else: if type(pd_data).__module__ == np.__name__: gd_data = cp.array(pd_data) else: gd_data = pd_data + is_string_data = isinstance(gd_data, list) and isinstance( + next(iter(gd_data), None), str + ) - expected = pd.to_datetime( - pd_data, dayfirst=dayfirst, infer_datetime_format=infer_datetime_format - ) - actual = cudf.to_datetime( - gd_data, dayfirst=dayfirst, infer_datetime_format=infer_datetime_format - ) - - assert_eq(actual, expected) + if dayfirst and not infer_datetime_format and is_string_data: + # Note: pandas<2.0 also does not respect dayfirst=True correctly + # for object data + with pytest.raises(NotImplementedError): + cudf.to_datetime( + gd_data, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format, + ) + else: + expected = pd.to_datetime( + pd_data, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format, + ) + actual = cudf.to_datetime( + gd_data, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format, + ) + assert_eq(actual, expected) @pytest.mark.parametrize( @@ -1250,40 +1272,31 @@ def test_datetime_reductions(data, op, dtype): assert_eq(expected, actual) +@pytest.mark.parametrize("timezone", ["naive", "UTC"]) @pytest.mark.parametrize( "data", [ - np.datetime_as_string( - np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[m]"), - timezone="UTC", - ), - np.datetime_as_string( - np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[m]"), - timezone="UTC", - ), - np.datetime_as_string( - np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[ns]"), - timezone="UTC", - ), - np.datetime_as_string( - np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[us]"), - timezone="UTC", - ), - np.datetime_as_string( - np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[s]"), - timezone="UTC", - ), + np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[m]"), + np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[m]"), + np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[ns]"), + np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[us]"), + np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[s]"), ], ) @pytest.mark.parametrize("dtype", DATETIME_TYPES) -def test_datetime_infer_format(data, dtype): - sr = cudf.Series(data) - psr = pd.Series(data) +def test_datetime_infer_format(data, timezone, dtype): + ts_data = np.datetime_as_string(data, timezone=timezone) + sr = cudf.Series(ts_data) + if timezone == "naive": + psr = pd.Series(ts_data) - expected = psr.astype(dtype) - actual = sr.astype(dtype) + expected = psr.astype(dtype) + actual = sr.astype(dtype) - assert_eq(expected, actual) + assert_eq(expected, actual) + else: + with pytest.raises(NotImplementedError): + sr.astype(dtype) def test_dateoffset_instance_subclass_check(): @@ -2150,6 +2163,12 @@ def test_daterange_pandas_compatibility(): assert_eq(expected, actual) +def test_strings_with_utc_offset_not_implemented(): + with pytest.warns(DeprecationWarning, match="parsing timezone"): # cupy + with pytest.raises(NotImplementedError): + DatetimeIndex(["2022-07-22 00:00:00+02:00"]) + + @pytest.mark.parametrize("code", ["z", "Z"]) def test_format_timezone_not_implemented(code): with pytest.raises(NotImplementedError): @@ -2158,6 +2177,12 @@ def test_format_timezone_not_implemented(code): ) +@pytest.mark.parametrize("tz", ["Z", "UTC-3", "+01:00"]) +def test_no_format_timezone_not_implemented(tz): + with pytest.raises(NotImplementedError): + cudf.to_datetime([f"2020-01-01 00:00:00{tz}"]) + + @pytest.mark.parametrize("arg", [True, False]) def test_args_not_datetime_typerror(arg): with pytest.raises(TypeError): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 39b36aae7ab..366a70033b2 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -11,6 +11,7 @@ import pytest import cudf +from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype from cudf.core._compat import PANDAS_GE_133, PANDAS_GE_200 from cudf.core.index import ( @@ -2785,3 +2786,21 @@ def test_empty_index_init(): gidx = cudf.Index([]) assert_eq(pidx, gidx) + + +@pytest.mark.parametrize( + "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)] +) +@pytest.mark.parametrize("data_name", [None, 1, "abc"]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("name", [None, no_default, 1, "abc"]) +def test_index_to_frame(data, data_name, index, name): + pidx = pd.Index(data, name=data_name) + gidx = cudf.from_pandas(pidx) + + with expect_warning_if(name is None): + expected = pidx.to_frame(index=index, name=name) + with expect_warning_if(name is None): + actual = gidx.to_frame(index=index, name=name) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 3c843ace0a8..fb2b0c07efb 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -16,6 +16,7 @@ import pytest import cudf +from cudf.api.extensions import no_default from cudf.core._compat import PANDAS_GE_200 from cudf.core.column import as_column from cudf.core.index import as_index @@ -1926,3 +1927,85 @@ def test_multiindex_to_series_error(): midx = cudf.MultiIndex.from_tuples([("a", "b")]) with pytest.raises(NotImplementedError): midx.to_series() + + +@pytest.mark.parametrize( + "pidx", + [ + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "a", "a"], + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + ), + ], +) +@pytest.mark.parametrize( + "name", [None, no_default, ["x", "y", "z"], ["rapids", "rapids", "rapids"]] +) +@pytest.mark.parametrize("allow_duplicates", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +def test_multiindex_to_frame_allow_duplicates( + pidx, name, allow_duplicates, index +): + gidx = cudf.from_pandas(pidx) + + if ( + ( + len(pidx.names) != len(set(pidx.names)) + and not all(x is None for x in pidx.names) + ) + and not allow_duplicates + and (name is None or name is no_default) + ): + assert_exceptions_equal( + pidx.to_frame, + gidx.to_frame, + lfunc_args_and_kwargs=( + [], + { + "index": index, + "name": name, + "allow_duplicates": allow_duplicates, + }, + ), + rfunc_args_and_kwargs=( + [], + { + "index": index, + "name": name, + "allow_duplicates": allow_duplicates, + }, + ), + ) + else: + if ( + len(pidx.names) != len(set(pidx.names)) + and not all(x is None for x in pidx.names) + and not isinstance(name, list) + ) or (isinstance(name, list) and len(name) != len(set(name))): + # cudf doesn't have the ability to construct dataframes + # with duplicate column names + with expect_warning_if(name is None): + with pytest.raises(ValueError): + gidx.to_frame( + index=index, + name=name, + allow_duplicates=allow_duplicates, + ) + else: + with expect_warning_if(name is None): + expected = pidx.to_frame( + index=index, name=name, allow_duplicates=allow_duplicates + ) + with expect_warning_if(name is None): + actual = gidx.to_frame( + index=index, name=name, allow_duplicates=allow_duplicates + ) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 9c8f591d9b0..71e1c1d8d1d 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -250,30 +250,37 @@ def test_misc_quantiles(data, q): ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_kurtosis_series(data, null_flag): +@pytest.mark.parametrize("numeric_only", [False, True]) +def test_kurtosis_series(data, null_flag, numeric_only): pdata = data.to_pandas() if null_flag and len(data) > 2: data.iloc[[0, 2]] = None pdata.iloc[[0, 2]] = None - got = data.kurtosis() + got = data.kurtosis(numeric_only=numeric_only) got = got if np.isscalar(got) else got.to_numpy() - expected = pdata.kurtosis() + expected = pdata.kurtosis(numeric_only=numeric_only) np.testing.assert_array_almost_equal(got, expected) - got = data.kurt() + got = data.kurt(numeric_only=numeric_only) got = got if np.isscalar(got) else got.to_numpy() - expected = pdata.kurt() + expected = pdata.kurt(numeric_only=numeric_only) np.testing.assert_array_almost_equal(got, expected) - got = data.kurt(numeric_only=False) - got = got if np.isscalar(got) else got.to_numpy() - expected = pdata.kurt(numeric_only=False) - np.testing.assert_array_almost_equal(got, expected) - with pytest.raises(NotImplementedError): - data.kurt(numeric_only=True) +@pytest.mark.parametrize("op", ["skew", "kurt"]) +def test_kurt_skew_error(op): + gs = cudf.Series(["ab", "cd"]) + ps = gs.to_pandas() + + with pytest.raises(FutureWarning): + assert_exceptions_equal( + getattr(gs, op), + getattr(ps, op), + lfunc_args_and_kwargs=([], {"numeric_only": True}), + rfunc_args_and_kwargs=([], {"numeric_only": True}), + ) @pytest.mark.parametrize( @@ -295,26 +302,19 @@ def test_kurtosis_series(data, null_flag): ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_skew_series(data, null_flag): +@pytest.mark.parametrize("numeric_only", [False, True]) +def test_skew_series(data, null_flag, numeric_only): pdata = data.to_pandas() if null_flag and len(data) > 2: data.iloc[[0, 2]] = None pdata.iloc[[0, 2]] = None - got = data.skew() - expected = pdata.skew() + got = data.skew(numeric_only=numeric_only) + expected = pdata.skew(numeric_only=numeric_only) got = got if np.isscalar(got) else got.to_numpy() np.testing.assert_array_almost_equal(got, expected) - got = data.skew(numeric_only=False) - expected = pdata.skew(numeric_only=False) - got = got if np.isscalar(got) else got.to_numpy() - np.testing.assert_array_almost_equal(got, expected) - - with pytest.raises(NotImplementedError): - data.skew(numeric_only=True) - @pytest.mark.parametrize("dtype", params_dtypes) @pytest.mark.parametrize("num_na", [0, 1, 50, 99, 100]) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 2bddd93ccb8..d54027eb707 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -200,12 +200,12 @@ def test_string_astype(dtype): data = ["True", "False", "True", "False", "False"] elif dtype.startswith("datetime64"): data = [ - "2019-06-04T00:00:00Z", - "2019-06-04T12:12:12Z", - "2019-06-03T00:00:00Z", - "2019-05-04T00:00:00Z", - "2018-06-04T00:00:00Z", - "1922-07-21T01:02:03Z", + "2019-06-04T00:00:00", + "2019-06-04T12:12:12", + "2019-06-03T00:00:00", + "2019-05-04T00:00:00", + "2018-06-04T00:00:00", + "1922-07-21T01:02:03", ] elif dtype == "str" or dtype == "object": data = ["ab", "cd", "ef", "gh", "ij"]