From d15bbfdded7181fdc23d33fa5efae181b4af2e2b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 4 Oct 2024 07:45:54 -1000 Subject: [PATCH 1/4] Allow melt(var_name=) to be a falsy label (#16981) closes https://github.com/rapidsai/cudf/issues/16972 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16981 --- python/cudf/cudf/core/reshape.py | 2 +- python/cudf/cudf/tests/test_reshape.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 6e5abb2b82b..3d132c92d54 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -681,7 +681,7 @@ def _tile(A, reps): nval = len(value_vars) dtype = min_unsigned_type(nval) - if not var_name: + if var_name is None: var_name = "variable" if not value_vars: diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 4235affd4d1..3adbe1d2a74 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -119,6 +119,15 @@ def test_melt_str_scalar_id_var(): assert_eq(result, expected) +def test_melt_falsy_var_name(): + df = cudf.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5], "C": [2, 4, 6]}) + result = cudf.melt(df, id_vars=["A"], value_vars=["B"], var_name="") + expected = pd.melt( + df.to_pandas(), id_vars=["A"], value_vars=["B"], var_name="" + ) + assert_eq(result, expected) + + @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 1000]) @pytest.mark.parametrize( From 04c17ded6563f4caaeeb51319672c10587401e33 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 4 Oct 2024 14:06:23 -0400 Subject: [PATCH 2/4] [FEA] Migrate nvtext/edit_distance APIs to pylibcudf (#16957) Apart of #15162. This PR migrates `edit_distance.pxd` to pylibcudf Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Yunsong Wang (https://github.com/PointKernel) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16957 --- cpp/include/nvtext/edit_distance.hpp | 2 +- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../pylibcudf/nvtext/edit_distance.rst | 6 ++ .../api_docs/pylibcudf/nvtext/index.rst | 7 +++ .../cudf/cudf/_lib/nvtext/edit_distance.pyx | 34 +++------- python/pylibcudf/pylibcudf/CMakeLists.txt | 1 + python/pylibcudf/pylibcudf/__init__.pxd | 2 + python/pylibcudf/pylibcudf/__init__.py | 2 + .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt | 22 +++++++ .../pylibcudf/pylibcudf/nvtext/__init__.pxd | 7 +++ python/pylibcudf/pylibcudf/nvtext/__init__.py | 7 +++ .../pylibcudf/nvtext/edit_distance.pxd | 8 +++ .../pylibcudf/nvtext/edit_distance.pyx | 63 +++++++++++++++++++ .../tests/test_nvtext_edit_distance.py | 34 ++++++++++ 14 files changed, 171 insertions(+), 25 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst create mode 100644 python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt create mode 100644 python/pylibcudf/pylibcudf/nvtext/__init__.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/__init__.py create mode 100644 python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py diff --git a/cpp/include/nvtext/edit_distance.hpp b/cpp/include/nvtext/edit_distance.hpp index 723ba310a1e..dca590baebf 100644 --- a/cpp/include/nvtext/edit_distance.hpp +++ b/cpp/include/nvtext/edit_distance.hpp @@ -57,7 +57,7 @@ namespace CUDF_EXPORT nvtext { * @param targets Strings to compute edit distance against `input` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New lists column of edit distance values */ std::unique_ptr edit_distance( cudf::strings_column_view const& input, diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index e21536e2e97..052479d6720 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -49,3 +49,4 @@ This page provides API documentation for pylibcudf. io/index.rst strings/index.rst + nvtext/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst new file mode 100644 index 00000000000..abb45e426a8 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst @@ -0,0 +1,6 @@ +============= +edit_distance +============= + +.. automodule:: pylibcudf.nvtext.edit_distance + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst new file mode 100644 index 00000000000..b5cd5ee42c3 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -0,0 +1,7 @@ +nvtext +====== + +.. toctree:: + :maxdepth: 1 + + edit_distance diff --git a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx index e3c2273345a..3dd99c42d76 100644 --- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx +++ b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx @@ -2,37 +2,23 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.edit_distance cimport ( - edit_distance as cpp_edit_distance, - edit_distance_matrix as cpp_edit_distance_matrix, -) +from pylibcudf cimport nvtext from cudf._lib.column cimport Column @acquire_spill_lock() def edit_distance(Column strings, Column targets): - cdef column_view c_strings = strings.view() - cdef column_view c_targets = targets.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_edit_distance(c_strings, c_targets)) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.edit_distance.edit_distance( + strings.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def edit_distance_matrix(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_edit_distance_matrix(c_strings)) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.edit_distance.edit_distance_matrix( + strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(result) diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index a7cb66d7b16..1d72eacac12 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -66,3 +66,4 @@ target_link_libraries(pylibcudf_interop PUBLIC nanoarrow) add_subdirectory(libcudf) add_subdirectory(strings) add_subdirectory(io) +add_subdirectory(nvtext) diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index a384edd456d..b98b37fe0fd 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -17,6 +17,7 @@ from . cimport ( lists, merge, null_mask, + nvtext, partitioning, quantiles, reduce, @@ -78,4 +79,5 @@ __all__ = [ "transpose", "types", "unary", + "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 2a5365e8fad..304f27be340 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -28,6 +28,7 @@ lists, merge, null_mask, + nvtext, partitioning, quantiles, reduce, @@ -92,4 +93,5 @@ "transpose", "types", "unary", + "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt new file mode 100644 index 00000000000..ebe1fda1f12 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources edit_distance.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_nvtext_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd new file mode 100644 index 00000000000..82f7c425b1d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . cimport edit_distance + +__all__ = [ + "edit_distance", +] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py new file mode 100644 index 00000000000..986652a241f --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import edit_distance + +__all__ = [ + "edit_distance", +] diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd new file mode 100644 index 00000000000..446b95afabb --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column edit_distance(Column input, Column targets) + +cpdef Column edit_distance_matrix(Column input) diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx new file mode 100644 index 00000000000..fc98ccbc50c --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.edit_distance cimport ( + edit_distance as cpp_edit_distance, + edit_distance_matrix as cpp_edit_distance_matrix, +) + + +cpdef Column edit_distance(Column input, Column targets): + """ + Returns the edit distance between individual strings in two strings columns + + For details, see :cpp:func:`edit_distance` + + Parameters + ---------- + input : Column + Input strings + targets : Column + Strings to compute edit distance against + + Returns + ------- + Column + New column of edit distance values + """ + cdef column_view c_strings = input.view() + cdef column_view c_targets = targets.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_edit_distance(c_strings, c_targets)) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column edit_distance_matrix(Column input): + """ + Returns the edit distance between all strings in the input strings column + + For details, see :cpp:func:`edit_distance_matrix` + + Parameters + ---------- + input : Column + Input strings + + Returns + ------- + Column + New column of edit distance values + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_edit_distance_matrix(c_strings)) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py new file mode 100644 index 00000000000..7d93c471cc4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture(scope="module") +def edit_distance_data(): + arr1 = ["hallo", "goodbye", "world"] + arr2 = ["hello", "", "world"] + return pa.array(arr1), pa.array(arr2) + + +def test_edit_distance(edit_distance_data): + input_col, targets = edit_distance_data + result = plc.nvtext.edit_distance.edit_distance( + plc.interop.from_arrow(input_col), + plc.interop.from_arrow(targets), + ) + expected = pa.array([1, 7, 0], type=pa.int32()) + assert_column_eq(result, expected) + + +def test_edit_distance_matrix(edit_distance_data): + input_col, _ = edit_distance_data + result = plc.nvtext.edit_distance.edit_distance_matrix( + plc.interop.from_arrow(input_col) + ) + expected = pa.array( + [[0, 7, 4], [7, 0, 6], [4, 6, 0]], type=pa.list_(pa.int32()) + ) + assert_column_eq(expected, result) From efaa0b50c6ffd15c6506847987cb531e5f6ba955 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 4 Oct 2024 08:20:34 -1000 Subject: [PATCH 3/4] Add string.convert.convert_datetime/convert_booleans APIs to pylibcudf (#16971) Contributes to https://github.com/rapidsai/cudf/issues/15162 Also address a review in https://github.com/rapidsai/cudf/pull/16935#discussion_r1783726477 This also modifies some `format` arguments in `convert_datetime.pyx` to accept `str` instead of `bytes` (`const string&`) to align more with Python. Let me know if you prefer to change this back Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16971 --- python/cudf/cudf/_lib/string_casting.pyx | 110 +++--------------- python/cudf_polars/cudf_polars/dsl/expr.py | 4 +- .../strings/convert/convert_booleans.pxd | 4 +- .../strings/convert/convert_datetime.pxd | 6 +- .../pylibcudf/strings/convert/CMakeLists.txt | 2 +- .../pylibcudf/strings/convert/__init__.pxd | 2 +- .../pylibcudf/strings/convert/__init__.py | 2 +- .../strings/convert/convert_booleans.pxd | 9 ++ .../strings/convert/convert_booleans.pyx | 91 +++++++++++++++ .../strings/convert/convert_datetime.pxd | 11 +- .../strings/convert/convert_datetime.pyx | 82 +++++++++++-- .../pylibcudf/tests/test_string_convert.py | 2 +- .../tests/test_string_convert_booleans.py | 26 +++++ .../tests/test_string_convert_datetime.py | 46 ++++++++ .../pylibcudf/tests/test_string_wrap.py | 5 +- 15 files changed, 286 insertions(+), 116 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 60a6795a402..55ff38f472d 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -3,9 +3,6 @@ from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar - -from cudf._lib.scalar cimport DeviceScalar - from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from libcpp.memory cimport unique_ptr @@ -14,14 +11,6 @@ from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_booleans cimport ( - from_booleans as cpp_from_booleans, - to_booleans as cpp_to_booleans, -) -from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( - is_timestamp as cpp_is_timestamp, -) from pylibcudf.libcudf.strings.convert.convert_floats cimport ( from_floats as cpp_from_floats, to_floats as cpp_to_floats, @@ -427,77 +416,21 @@ def stoul(Column input_col): return string_to_integer(input_col, cudf.dtype("uint64")) -def _to_booleans(Column input_col, object string_true="True"): - """ - Converting/Casting input column of type string to boolean column - - Parameters - ---------- - input_col : input column of type string - string_true : string that represents True - - Returns - ------- - A Column with string values cast to boolean - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_booleans( - input_column_view, - string_scalar_true[0])) - - return Column.from_unique_ptr(move(c_result)) - - def to_booleans(Column input_col): - - return _to_booleans(input_col) - - -def _from_booleans( - Column input_col, - object string_true="True", - object string_false="False"): - """ - Converting/Casting input column of type boolean to string column - - Parameters - ---------- - input_col : input column of type boolean - string_true : string that represents True - string_false : string that represents False - - Returns - ------- - A Column with boolean values cast to string - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef DeviceScalar str_false = as_device_scalar(string_false) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef const string_scalar* string_scalar_false = ( - str_false.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_booleans( - input_column_view, - string_scalar_true[0], - string_scalar_false[0])) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_booleans.to_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").c_value, + ) + return Column.from_pylibcudf(plc_column) def from_booleans(Column input_col): - return _from_booleans(input_col) + plc_column = plc.strings.convert.convert_booleans.from_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").c_value, + as_device_scalar("False").c_value, + ) + return Column.from_pylibcudf(plc_column) def int2timestamp( @@ -520,11 +453,10 @@ def int2timestamp( A Column with date-time represented in string format """ - cdef string c_timestamp_format = format.encode("UTF-8") return Column.from_pylibcudf( plc.strings.convert.convert_datetime.from_timestamps( input_col.to_pylibcudf(mode="read"), - c_timestamp_format, + format, names.to_pylibcudf(mode="read") ) ) @@ -545,12 +477,11 @@ def timestamp2int(Column input_col, dtype, format): """ dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_datetime.to_timestamps( input_col.to_pylibcudf(mode="read"), dtype, - c_timestamp_format + format ) ) @@ -572,16 +503,11 @@ def istimestamp(Column input_col, str format): """ if input_col.size == 0: return cudf.core.column.column_empty(0, dtype=cudf.dtype("bool")) - cdef column_view input_column_view = input_col.view() - cdef string c_timestamp_format = str(format).encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_is_timestamp( - input_column_view, - c_timestamp_format)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_datetime.is_timestamp( + input_col.to_pylibcudf(mode="read"), + format + ) + return Column.from_pylibcudf(plc_column) def timedelta2int(Column input_col, dtype, format): diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index c401e5a2f17..54476b7fedc 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -914,7 +914,7 @@ def do_evaluate( col = self.children[0].evaluate(df, context=context, mapping=mapping) is_timestamps = plc.strings.convert.convert_datetime.is_timestamp( - col.obj, format.encode() + col.obj, format ) if strict: @@ -937,7 +937,7 @@ def do_evaluate( ) return Column( plc.strings.convert.convert_datetime.to_timestamps( - res.columns()[0], self.dtype, format.encode() + res.columns()[0], self.dtype, format ) ) elif self.name == pl_expr.StringFunction.Replace: diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd index 83a9573baad..e6688cfff81 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd @@ -8,10 +8,10 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar cdef extern from "cudf/strings/convert/convert_booleans.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_booleans( - column_view input_col, + column_view input, string_scalar true_string) except + cdef unique_ptr[column] from_booleans( - column_view input_col, + column_view booleans, string_scalar true_string, string_scalar false_string) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd index fa8975c4df9..fceddd58df0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd @@ -10,14 +10,14 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_timestamps( - column_view input_col, + column_view input, data_type timestamp_type, string format) except + cdef unique_ptr[column] from_timestamps( - column_view input_col, + column_view timestamps, string format, - column_view input_strings_names) except + + column_view names) except + cdef unique_ptr[column] is_timestamp( column_view input_col, diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 175c9b3738e..3febc78dfd2 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources convert_durations.pyx convert_datetime.pyx) +set(cython_sources convert_booleans.pyx convert_durations.pyx convert_datetime.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 05324cb49df..5525bca46d6 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -1,2 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport convert_datetime, convert_durations +from . cimport convert_booleans, convert_datetime, convert_durations diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index d803399d53c..2340ebe9a26 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -1,2 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import convert_datetime, convert_durations +from . import convert_booleans, convert_datetime, convert_durations diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd new file mode 100644 index 00000000000..312ac3c0ca0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar + + +cpdef Column to_booleans(Column input, Scalar true_string) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx new file mode 100644 index 00000000000..0c10f821ab6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.convert cimport ( + convert_booleans as cpp_convert_booleans, +) +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + + +cpdef Column to_booleans(Column input, Scalar true_string): + """ + Returns a new bool column by parsing boolean values from the strings + in the provided strings column. + + For details, see :cpp:func:`cudf::strings::to_booleans`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + true_string : Scalar + String to expect for true. Non-matching strings are false + + Returns + ------- + Column + New bool column converted from strings. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_convert_booleans.to_booleans( + input.view(), + dereference(c_true_string) + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string): + """ + Returns a new strings column converting the boolean values from the + provided column into strings. + + For details, see :cpp:func:`cudf::strings::from_booleans`. + + Parameters + ---------- + booleans : Column + Boolean column to convert. + + true_string : Scalar + String to use for true in the output column. + + false_string : Scalar + String to use for false in the output column. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + cdef const string_scalar* c_false_string = ( + false_string.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_convert_booleans.from_booleans( + booleans.view(), + dereference(c_true_string), + dereference(c_false_string), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd index 07c84d263d6..80ec168644b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd @@ -8,11 +8,16 @@ from pylibcudf.types cimport DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ) + +cpdef Column is_timestamp( + Column input, + str format, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx index fcacb096f87..0ee60812e00 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -15,28 +15,74 @@ from pylibcudf.types import DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ): + """ + Returns a new timestamp column converting a strings column into + timestamps using the provided format pattern. + + For details, see cpp:`cudf::strings::to_timestamps`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + timestamp_type : DataType + The timestamp type used for creating the output column. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New datetime column + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.to_timestamps( input.view(), timestamp_type.c_obj, - format + c_format ) return Column.from_libcudf(move(c_result)) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ): + """ + Returns a new strings column converting a timestamp column into + strings using the provided format pattern. + + For details, see cpp:`cudf::strings::from_timestamps`. + + Parameters + ---------- + timestamps : Column + Timestamp values to convert + + format : str + The string specifying output format. + + input_strings_names : Column + The string names to use for weekdays ("%a", "%A") and months ("%b", "%B"). + + Returns + ------- + Column + New strings column with formatted timestamps. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.from_timestamps( - input.view(), - format, + timestamps.view(), + c_format, input_strings_names.view() ) @@ -44,13 +90,33 @@ cpdef Column from_timestamps( cpdef Column is_timestamp( Column input, - const string& format + str format ): + """ + Verifies the given strings column can be parsed to timestamps + using the provided format pattern. + + For details, see cpp:`cudf::strings::is_timestamp`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New bool column. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.is_timestamp( input.view(), - format + c_format ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py index e9e95459d0e..22bb4971cb1 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -62,7 +62,7 @@ def test_to_datetime( got = plc.strings.convert.convert_datetime.to_timestamps( plc_timestamp_col, plc.interop.from_arrow(timestamp_type), - format.encode(), + format, ) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py new file mode 100644 index 00000000000..117c59ff1b8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_booleans(): + pa_array = pa.array(["true", None, "True"]) + result = plc.strings.convert.convert_booleans.to_booleans( + plc.interop.from_arrow(pa_array), + plc.interop.from_arrow(pa.scalar("True")), + ) + expected = pa.array([False, None, True]) + assert_column_eq(result, expected) + + +def test_from_booleans(): + pa_array = pa.array([True, None, False]) + result = plc.strings.convert.convert_booleans.from_booleans( + plc.interop.from_arrow(pa_array), + plc.interop.from_arrow(pa.scalar("A")), + plc.interop.from_arrow(pa.scalar("B")), + ) + expected = pa.array(["A", None, "B"]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py new file mode 100644 index 00000000000..f3e84286a36 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py @@ -0,0 +1,46 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import datetime + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture +def fmt(): + return "%Y-%m-%dT%H:%M:%S" + + +def test_to_timestamp(fmt): + arr = pa.array(["2020-01-01T01:01:01", None]) + result = plc.strings.convert.convert_datetime.to_timestamps( + plc.interop.from_arrow(arr), + plc.DataType(plc.TypeId.TIMESTAMP_SECONDS), + fmt, + ) + expected = pc.strptime(arr, fmt, "s") + assert_column_eq(result, expected) + + +def test_from_timestamp(fmt): + arr = pa.array([datetime.datetime(2020, 1, 1, 1, 1, 1), None]) + result = plc.strings.convert.convert_datetime.from_timestamps( + plc.interop.from_arrow(arr), + fmt, + plc.interop.from_arrow(pa.array([], type=pa.string())), + ) + # pc.strftime will add the extra %f + expected = pa.array(["2020-01-01T01:01:01", None]) + assert_column_eq(result, expected) + + +def test_is_timestamp(fmt): + arr = pa.array(["2020-01-01T01:01:01", None, "2020-01-01"]) + result = plc.strings.convert.convert_datetime.is_timestamp( + plc.interop.from_arrow(arr), + fmt, + ) + expected = pa.array([True, None, False]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py index 85abd3a2bae..a1c820cd586 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py @@ -7,6 +7,7 @@ def test_wrap(): + width = 12 pa_array = pa.array( [ "the quick brown fox jumped over the lazy brown dog", @@ -14,10 +15,10 @@ def test_wrap(): None, ] ) - result = plc.strings.wrap.wrap(plc.interop.from_arrow(pa_array), 12) + result = plc.strings.wrap.wrap(plc.interop.from_arrow(pa_array), width) expected = pa.array( [ - textwrap.fill(val, 12) if isinstance(val, str) else val + textwrap.fill(val, width) if isinstance(val, str) else val for val in pa_array.to_pylist() ] ) From a8da1ff2b393abbafa27dddcf4c19481ec853c28 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 4 Oct 2024 12:11:31 -0700 Subject: [PATCH 4/4] Deprecate support for directly accessing logger (#16964) This PR removes support for accessing cudf's underlying spdlog logger directly. Contributes to https://github.com/rapidsai/build-planning/issues/104 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16964 --- cpp/include/cudf/detail/utilities/logger.hpp | 14 ++++---- cpp/include/cudf/utilities/logger.hpp | 8 ++++- cpp/src/utilities/logger.cpp | 4 ++- cpp/tests/utilities_tests/logger_tests.cpp | 37 ++++++++++---------- 4 files changed, 36 insertions(+), 27 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/logger.hpp b/cpp/include/cudf/detail/utilities/logger.hpp index 8c1c3c28df8..e7643eb44bd 100644 --- a/cpp/include/cudf/detail/utilities/logger.hpp +++ b/cpp/include/cudf/detail/utilities/logger.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,9 +19,9 @@ #include // Log messages that require computation should only be used at level TRACE and DEBUG -#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::logger(), __VA_ARGS__) +#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::detail::logger(), __VA_ARGS__) diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp index 45d5d1b12e1..982554a23f5 100644 --- a/cpp/include/cudf/utilities/logger.hpp +++ b/cpp/include/cudf/utilities/logger.hpp @@ -22,6 +22,10 @@ namespace CUDF_EXPORT cudf { +namespace detail { +spdlog::logger& logger(); +} + /** * @brief Returns the global logger. * @@ -43,6 +47,8 @@ namespace CUDF_EXPORT cudf { * * @return spdlog::logger& The logger. */ -spdlog::logger& logger(); +[[deprecated( + "Support for direct access to spdlog loggers in cudf is planned for removal")]] spdlog::logger& +logger(); } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/utilities/logger.cpp b/cpp/src/utilities/logger.cpp index d54f5677c4c..e52fffbd8c6 100644 --- a/cpp/src/utilities/logger.cpp +++ b/cpp/src/utilities/logger.cpp @@ -74,8 +74,10 @@ struct logger_wrapper { } // namespace -spdlog::logger& cudf::logger() +spdlog::logger& cudf::detail::logger() { static logger_wrapper wrapped{}; return wrapped.logger_; } + +spdlog::logger& cudf::logger() { return cudf::detail::logger(); } diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp index d052e20eedb..cfab570833b 100644 --- a/cpp/tests/utilities_tests/logger_tests.cpp +++ b/cpp/tests/utilities_tests/logger_tests.cpp @@ -28,16 +28,17 @@ class LoggerTest : public cudf::test::BaseFixture { std::vector prev_sinks; public: - LoggerTest() : prev_level{cudf::logger().level()}, prev_sinks{cudf::logger().sinks()} + LoggerTest() + : prev_level{cudf::detail::logger().level()}, prev_sinks{cudf::detail::logger().sinks()} { - cudf::logger().sinks() = {std::make_shared(oss)}; - cudf::logger().set_formatter( + cudf::detail::logger().sinks() = {std::make_shared(oss)}; + cudf::detail::logger().set_formatter( std::unique_ptr(new spdlog::pattern_formatter("%v"))); } ~LoggerTest() override { - cudf::logger().set_level(prev_level); - cudf::logger().sinks() = prev_sinks; + cudf::detail::logger().set_level(prev_level); + cudf::detail::logger().sinks() = prev_sinks; } void clear_sink() { oss.str(""); } @@ -46,32 +47,32 @@ class LoggerTest : public cudf::test::BaseFixture { TEST_F(LoggerTest, Basic) { - cudf::logger().critical("crit msg"); + cudf::detail::logger().critical("crit msg"); ASSERT_EQ(this->sink_content(), "crit msg\n"); } TEST_F(LoggerTest, DefaultLevel) { - cudf::logger().trace("trace"); - cudf::logger().debug("debug"); - cudf::logger().info("info"); - cudf::logger().warn("warn"); - cudf::logger().error("error"); - cudf::logger().critical("critical"); + cudf::detail::logger().trace("trace"); + cudf::detail::logger().debug("debug"); + cudf::detail::logger().info("info"); + cudf::detail::logger().warn("warn"); + cudf::detail::logger().error("error"); + cudf::detail::logger().critical("critical"); ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n"); } TEST_F(LoggerTest, CustomLevel) { - cudf::logger().set_level(spdlog::level::warn); - cudf::logger().info("info"); - cudf::logger().warn("warn"); + cudf::detail::logger().set_level(spdlog::level::warn); + cudf::detail::logger().info("info"); + cudf::detail::logger().warn("warn"); ASSERT_EQ(this->sink_content(), "warn\n"); this->clear_sink(); - cudf::logger().set_level(spdlog::level::debug); - cudf::logger().trace("trace"); - cudf::logger().debug("debug"); + cudf::detail::logger().set_level(spdlog::level::debug); + cudf::detail::logger().trace("trace"); + cudf::detail::logger().debug("debug"); ASSERT_EQ(this->sink_content(), "debug\n"); }