From 187053abc4b3941ab1fa26828d396042e91c2b10 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 16 Dec 2024 21:18:46 -0800 Subject: [PATCH] Remove cudf._lib.string_casting in favor of inlining pylibcudf (#17460) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17460 --- python/cudf/cudf/_lib/CMakeLists.txt | 4 +- python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/string_casting.pyx | 598 ---------------------- python/cudf/cudf/core/column/datetime.py | 12 +- python/cudf/cudf/core/column/numerical.py | 40 +- python/cudf/cudf/core/column/string.py | 170 +++--- python/cudf/cudf/core/column/timedelta.py | 12 +- python/cudf/cudf/core/tools/numeric.py | 3 +- 8 files changed, 129 insertions(+), 711 deletions(-) delete mode 100644 python/cudf/cudf/_lib/string_casting.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 5b9fa83b33c..bfbfbfed333 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. # ============================================================================= -set(cython_sources column.pyx groupby.pyx interop.pyx scalar.pyx string_casting.pyx strings_udf.pyx - types.pyx utils.pyx +set(cython_sources column.pyx groupby.pyx interop.pyx scalar.pyx strings_udf.pyx types.pyx + utils.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 63090ef86c8..e18e05cc43e 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -4,7 +4,6 @@ from . import ( groupby, interop, - string_casting, strings_udf, ) diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx deleted file mode 100644 index 06ee07d8e2b..00000000000 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ /dev/null @@ -1,598 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf._lib.column cimport Column - -import pylibcudf as plc -from pylibcudf.types cimport DataType - -from cudf._lib.scalar import as_device_scalar - -from cudf._lib.types cimport dtype_to_pylibcudf_type - - -def floating_to_string(Column input_col): - plc_column = plc.strings.convert.convert_floats.from_floats( - input_col.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(plc_column) - - -def string_to_floating(Column input_col, DataType out_type): - plc_column = plc.strings.convert.convert_floats.to_floats( - input_col.to_pylibcudf(mode="read"), - out_type - ) - return Column.from_pylibcudf(plc_column) - - -def dtos(Column input_col): - """ - Converting/Casting input column of type double to string column - - Parameters - ---------- - input_col : input column of type double - - Returns - ------- - A Column with double values cast to string - """ - - return floating_to_string(input_col) - - -def stod(Column input_col): - """ - Converting/Casting input column of type string to double - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to double - """ - - return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT64)) - - -def ftos(Column input_col): - """ - Converting/Casting input column of type float to string column - - Parameters - ---------- - input_col : input column of type double - - Returns - ------- - A Column with float values cast to string - """ - - return floating_to_string(input_col) - - -def stof(Column input_col): - """ - Converting/Casting input column of type string to float - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to float - """ - - return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT32)) - - -def integer_to_string(Column input_col): - plc_column = plc.strings.convert.convert_integers.from_integers( - input_col.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(plc_column) - - -def string_to_integer(Column input_col, DataType out_type): - plc_column = plc.strings.convert.convert_integers.to_integers( - input_col.to_pylibcudf(mode="read"), - out_type - ) - return Column.from_pylibcudf(plc_column) - - -def i8tos(Column input_col): - """ - Converting/Casting input column of type int8 to string column - - Parameters - ---------- - input_col : input column of type int8 - - Returns - ------- - A Column with int8 values cast to string - """ - - return integer_to_string(input_col) - - -def stoi8(Column input_col): - """ - Converting/Casting input column of type string to int8 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to int8 - """ - - return string_to_integer(input_col, plc.DataType(plc.TypeId.INT8)) - - -def i16tos(Column input_col): - """ - Converting/Casting input column of type int16 to string column - - Parameters - ---------- - input_col : input column of type int16 - - Returns - ------- - A Column with int16 values cast to string - """ - - return integer_to_string(input_col) - - -def stoi16(Column input_col): - """ - Converting/Casting input column of type string to int16 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to int16 - """ - - return string_to_integer(input_col, plc.DataType(plc.TypeId.INT16)) - - -def itos(Column input_col): - """ - Converting/Casting input column of type int32 to string column - - Parameters - ---------- - input_col : input column of type int32 - - Returns - ------- - A Column with int32 values cast to string - """ - - return integer_to_string(input_col) - - -def stoi(Column input_col): - """ - Converting/Casting input column of type string to int32 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to int32 - """ - - return string_to_integer(input_col, plc.DataType(plc.TypeId.INT32)) - - -def ltos(Column input_col): - """ - Converting/Casting input column of type int64 to string column - - Parameters - ---------- - input_col : input column of type int64 - - Returns - ------- - A Column with int64 values cast to string - """ - - return integer_to_string(input_col) - - -def stol(Column input_col): - """ - Converting/Casting input column of type string to int64 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to int64 - """ - - return string_to_integer(input_col, plc.DataType(plc.TypeId.INT64)) - - -def ui8tos(Column input_col): - """ - Converting/Casting input column of type uint8 to string column - - Parameters - ---------- - input_col : input column of type uint8 - - Returns - ------- - A Column with uint8 values cast to string - """ - - return integer_to_string(input_col) - - -def stoui8(Column input_col): - """ - Converting/Casting input column of type string to uint8 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to uint8 - """ - - return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT8)) - - -def ui16tos(Column input_col): - """ - Converting/Casting input column of type uint16 to string column - - Parameters - ---------- - input_col : input column of type uint16 - - Returns - ------- - A Column with uint16 values cast to string - """ - - return integer_to_string(input_col) - - -def stoui16(Column input_col): - """ - Converting/Casting input column of type string to uint16 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to uint16 - """ - - return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT16)) - - -def uitos(Column input_col): - """ - Converting/Casting input column of type uint32 to string column - - Parameters - ---------- - input_col : input column of type uint32 - - Returns - ------- - A Column with uint32 values cast to string - """ - - return integer_to_string(input_col) - - -def stoui(Column input_col): - """ - Converting/Casting input column of type string to uint32 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to uint32 - """ - - return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT32)) - - -def ultos(Column input_col): - """ - Converting/Casting input column of type uint64 to string column - - Parameters - ---------- - input_col : input column of type uint64 - - Returns - ------- - A Column with uint64 values cast to string - """ - - return integer_to_string(input_col) - - -def stoul(Column input_col): - """ - Converting/Casting input column of type string to uint64 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to uint64 - """ - - return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT64)) - - -def to_booleans(Column input_col): - plc_column = plc.strings.convert.convert_booleans.to_booleans( - input_col.to_pylibcudf(mode="read"), - as_device_scalar("True").c_value, - ) - return Column.from_pylibcudf(plc_column) - - -def from_booleans(Column input_col): - plc_column = plc.strings.convert.convert_booleans.from_booleans( - input_col.to_pylibcudf(mode="read"), - as_device_scalar("True").c_value, - as_device_scalar("False").c_value, - ) - return Column.from_pylibcudf(plc_column) - - -def int2timestamp( - Column input_col, - str format, - Column names): - """ - Converting/Casting input date-time column to string - column with specified format - - Parameters - ---------- - input_col : input column of type timestamp in integer format - format : The string specifying output format - names : The string names to use for weekdays ("%a", "%A") and - months ("%b", "%B") - - Returns - ------- - A Column with date-time represented in string format - - """ - return Column.from_pylibcudf( - plc.strings.convert.convert_datetime.from_timestamps( - input_col.to_pylibcudf(mode="read"), - format, - names.to_pylibcudf(mode="read") - ) - ) - - -def timestamp2int(Column input_col, dtype, format): - """ - Converting/Casting input string column to date-time column with specified - timestamp_format - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with string represented in date-time format - - """ - dtype = dtype_to_pylibcudf_type(dtype) - return Column.from_pylibcudf( - plc.strings.convert.convert_datetime.to_timestamps( - input_col.to_pylibcudf(mode="read"), - dtype, - format - ) - ) - - -def istimestamp(Column input_col, str format): - """ - Check input string column matches the specified timestamp format - - Parameters - ---------- - input_col : input column of type string - - format : format string of timestamp specifiers - - Returns - ------- - A Column of boolean values identifying strings that matched the format. - - """ - plc_column = plc.strings.convert.convert_datetime.is_timestamp( - input_col.to_pylibcudf(mode="read"), - format - ) - return Column.from_pylibcudf(plc_column) - - -def timedelta2int(Column input_col, dtype, format): - """ - Converting/Casting input string column to TimeDelta column with specified - format - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with string represented in TimeDelta format - - """ - dtype = dtype_to_pylibcudf_type(dtype) - return Column.from_pylibcudf( - plc.strings.convert.convert_durations.to_durations( - input_col.to_pylibcudf(mode="read"), - dtype, - format - ) - ) - - -def int2timedelta(Column input_col, str format): - """ - Converting/Casting input Timedelta column to string - column with specified format - - Parameters - ---------- - input_col : input column of type Timedelta in integer format - - Returns - ------- - A Column with Timedelta represented in string format - - """ - return Column.from_pylibcudf( - plc.strings.convert.convert_durations.from_durations( - input_col.to_pylibcudf(mode="read"), - format - ) - ) - - -def int2ip(Column input_col): - """ - Converting/Casting integer column to string column in ipv4 format - - Parameters - ---------- - input_col : input integer column - - Returns - ------- - A Column with integer represented in string ipv4 format - - """ - plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4( - input_col.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -def ip2int(Column input_col): - """ - Converting string ipv4 column to integer column - - Parameters - ---------- - input_col : input string column - - Returns - ------- - A Column with ipv4 represented as integer - - """ - plc_column = plc.strings.convert.convert_ipv4.ipv4_to_integers( - input_col.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -def is_ipv4(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have strings in IPv4 format. This format is nnn.nnn.nnn.nnn - where nnn is integer digits in [0,255]. - """ - plc_column = plc.strings.convert.convert_ipv4.is_ipv4( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -def htoi(Column input_col): - """ - Converting input column of type string having hex values - to integer of out_type - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column of integers parsed from hexadecimal string values. - """ - plc_column = plc.strings.convert.convert_integers.hex_to_integers( - input_col.to_pylibcudf(mode="read"), - plc.DataType(plc.TypeId.INT64) - ) - return Column.from_pylibcudf(plc_column) - - -def is_hex(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have hex characters. - """ - plc_column = plc.strings.convert.convert_integers.is_hex( - source_strings.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(plc_column) - - -def itoh(Column input_col): - """ - Converting input column of type integer to a string - column with hexadecimal character digits. - - Parameters - ---------- - input_col : input column of type integer - - Returns - ------- - A Column of strings with hexadecimal characters. - """ - plc_column = plc.strings.convert.convert_integers.integers_to_hex( - input_col.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index c991f291eec..1a820da3c62 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -19,7 +19,6 @@ import cudf import cudf.core.column.column as column -import cudf.core.column.string as string from cudf import _lib as libcudf from cudf.core._compat import PANDAS_GE_220 from cudf.core._internals import binaryop, unary @@ -602,9 +601,14 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn: names = as_column(_DATETIME_NAMES) else: names = column.column_empty(0, dtype="object") - return string._datetime_to_str_typecast_functions[self.dtype]( - self, format, names - ) + with acquire_spill_lock(): + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.strings.convert.convert_datetime.from_timestamps( + self.to_pylibcudf(mode="read"), + format, + names.to_pylibcudf(mode="read"), + ) + ) def as_string_column(self) -> cudf.core.column.StringColumn: format = _dtype_to_format_conversion.get( diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f099cef3331..4405e153b0c 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -14,8 +14,6 @@ import cudf import cudf.core.column.column as column -import cudf.core.column.string as string -from cudf import _lib as libcudf from cudf.api.types import is_integer, is_scalar from cudf.core._internals import binaryop, unary from cudf.core.buffer import acquire_spill_lock, as_buffer @@ -366,22 +364,42 @@ def normalize_binop_value(self, other: ScalarLike) -> Self | cudf.Scalar: else: return NotImplemented - def int2ip(self) -> "cudf.core.column.StringColumn": - if self.dtype != cudf.dtype("uint32"): + @acquire_spill_lock() + def int2ip(self) -> cudf.core.column.StringColumn: + if self.dtype != np.dtype(np.uint32): raise TypeError("Only uint32 type can be converted to ip") - - return libcudf.string_casting.int2ip(self) + plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] def as_string_column(self) -> cudf.core.column.StringColumn: - if len(self) > 0: - return string._numeric_to_str_typecast_functions[ - cudf.dtype(self.dtype) - ](self) - else: + if len(self) == 0: return cast( cudf.core.column.StringColumn, column.column_empty(0, dtype="object"), ) + elif self.dtype.kind == "b": + conv_func = functools.partial( + plc.strings.convert.convert_booleans.from_booleans, + true_string=cudf.Scalar( + "True", dtype="str" + ).device_value.c_value, + false_string=cudf.Scalar( + "False", dtype="str" + ).device_value.c_value, + ) + elif self.dtype.kind in {"i", "u"}: + conv_func = plc.strings.convert.convert_integers.from_integers + elif self.dtype.kind == "f": + conv_func = plc.strings.convert.convert_floats.from_floats + else: + raise ValueError(f"No string conversion from type {self.dtype}") + + with acquire_spill_lock(): + return type(self).from_pylibcudf( # type: ignore[return-value] + conv_func(self.to_pylibcudf(mode="read")) + ) def as_datetime_column( self, dtype: Dtype diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 0c93f60eab2..fcdcb789f23 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -20,9 +20,8 @@ import cudf.core.column.column as column import cudf.core.column.datetime as datetime from cudf import _lib as libcudf -from cudf._lib import string_casting as str_cast from cudf._lib.column import Column -from cudf._lib.types import size_type_dtype +from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock @@ -49,62 +48,7 @@ from cudf.core.column.numerical import NumericalColumn -def str_to_boolean(column: StringColumn): - """Takes in string column and returns boolean column""" - with acquire_spill_lock(): - plc_column = plc.strings.attributes.count_characters( - column.to_pylibcudf(mode="read") - ) - result = Column.from_pylibcudf(plc_column) - return (result > cudf.Scalar(0, dtype="int8")).fillna(False) - - -_str_to_numeric_typecast_functions = { - cudf.api.types.dtype("int8"): str_cast.stoi8, - cudf.api.types.dtype("int16"): str_cast.stoi16, - cudf.api.types.dtype("int32"): str_cast.stoi, - cudf.api.types.dtype("int64"): str_cast.stol, - cudf.api.types.dtype("uint8"): str_cast.stoui8, - cudf.api.types.dtype("uint16"): str_cast.stoui16, - cudf.api.types.dtype("uint32"): str_cast.stoui, - cudf.api.types.dtype("uint64"): str_cast.stoul, - cudf.api.types.dtype("float32"): str_cast.stof, - cudf.api.types.dtype("float64"): str_cast.stod, - cudf.api.types.dtype("bool"): str_to_boolean, -} - -_numeric_to_str_typecast_functions = { - cudf.api.types.dtype("int8"): str_cast.i8tos, - cudf.api.types.dtype("int16"): str_cast.i16tos, - cudf.api.types.dtype("int32"): str_cast.itos, - cudf.api.types.dtype("int64"): str_cast.ltos, - cudf.api.types.dtype("uint8"): str_cast.ui8tos, - cudf.api.types.dtype("uint16"): str_cast.ui16tos, - cudf.api.types.dtype("uint32"): str_cast.uitos, - cudf.api.types.dtype("uint64"): str_cast.ultos, - cudf.api.types.dtype("float32"): str_cast.ftos, - cudf.api.types.dtype("float64"): str_cast.dtos, - cudf.api.types.dtype("bool"): str_cast.from_booleans, -} - -_datetime_to_str_typecast_functions = { - # TODO: support Date32 UNIX days - # cudf.api.types.dtype("datetime64[D]"): str_cast.int2timestamp, - cudf.api.types.dtype("datetime64[s]"): str_cast.int2timestamp, - cudf.api.types.dtype("datetime64[ms]"): str_cast.int2timestamp, - cudf.api.types.dtype("datetime64[us]"): str_cast.int2timestamp, - cudf.api.types.dtype("datetime64[ns]"): str_cast.int2timestamp, -} - -_timedelta_to_str_typecast_functions = { - cudf.api.types.dtype("timedelta64[s]"): str_cast.int2timedelta, - cudf.api.types.dtype("timedelta64[ms]"): str_cast.int2timedelta, - cudf.api.types.dtype("timedelta64[us]"): str_cast.int2timedelta, - cudf.api.types.dtype("timedelta64[ns]"): str_cast.int2timedelta, -} - - -def _is_supported_regex_flags(flags): +def _is_supported_regex_flags(flags: int) -> bool: return flags == 0 or ( (flags & (re.MULTILINE | re.DOTALL) != 0) and (flags & ~(re.MULTILINE | re.DOTALL) == 0) @@ -155,10 +99,7 @@ def htoi(self) -> SeriesOrIndex: 3 51966 dtype: int64 """ - - out = str_cast.htoi(self._column) - - return self._return_or_inplace(out, inplace=False) + return self._return_or_inplace(self._column.hex_to_integers()) hex_to_int = htoi @@ -188,10 +129,7 @@ def ip2int(self) -> SeriesOrIndex: 2 0 dtype: int64 """ - - out = str_cast.ip2int(self._column) - - return self._return_or_inplace(out, inplace=False) + return self._return_or_inplace(self._column.ipv4_to_integers()) ip_to_int = ip2int @@ -1380,7 +1318,7 @@ def ishex(self) -> SeriesOrIndex: 4 True dtype: bool """ - return self._return_or_inplace(str_cast.is_hex(self._column)) + return self._return_or_inplace(self._column.is_hex()) def istimestamp(self, format: str) -> SeriesOrIndex: """ @@ -1404,9 +1342,7 @@ def istimestamp(self, format: str) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace( - str_cast.istimestamp(self._column, format) - ) + return self._return_or_inplace(self._column.is_timestamp(format)) def isfloat(self) -> SeriesOrIndex: r""" @@ -1957,7 +1893,7 @@ def isipv4(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(str_cast.is_ipv4(self._column)) + return self._return_or_inplace(self._column.is_ipv4()) def lower(self) -> SeriesOrIndex: """ @@ -5822,26 +5758,38 @@ def __contains__(self, item: ScalarLike) -> bool: other = [item] if is_scalar(item) else item return self.contains(column.as_column(other, dtype=self.dtype)).any() - def as_numerical_column( - self, dtype: Dtype - ) -> "cudf.core.column.NumericalColumn": + def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: out_dtype = cudf.api.types.dtype(dtype) - string_col = self - if out_dtype.kind in {"i", "u"}: - if not string_col.is_integer().all(): + if out_dtype.kind == "b": + with acquire_spill_lock(): + plc_column = plc.strings.attributes.count_characters( + self.to_pylibcudf(mode="read") + ) + result = Column.from_pylibcudf(plc_column) + return (result > cudf.Scalar(0, dtype="int8")).fillna(False) + elif out_dtype.kind in {"i", "u"}: + if not self.is_integer().all(): raise ValueError( "Could not convert strings to integer " "type due to presence of non-integer values." ) + cast_func = plc.strings.convert.convert_integers.to_integers elif out_dtype.kind == "f": - if not string_col.is_float().all(): + if not self.is_float().all(): raise ValueError( "Could not convert strings to float " "type due to presence of non-floating values." ) - - result_col = _str_to_numeric_typecast_functions[out_dtype](string_col) - return result_col + cast_func = plc.strings.convert.convert_floats.to_floats + else: + raise ValueError( + f"dtype must be a numerical type, not {out_dtype}" + ) + plc_dtype = dtype_to_pylibcudf_type(out_dtype) + with acquire_spill_lock(): + return type(self).from_pylibcudf( # type: ignore[return-value] + cast_func(self.to_pylibcudf(mode="read"), plc_dtype) + ) def strptime( self, dtype: Dtype, format: str @@ -5876,23 +5824,27 @@ def strptime( raise NotImplementedError( "Cannot parse date-like strings with different formats" ) - valid_ts = str_cast.istimestamp(self, format) + valid_ts = self.is_timestamp(format) valid = valid_ts | is_nat if not valid.all(): raise ValueError(f"Column contains invalid data for {format=}") - casting_func = str_cast.timestamp2int + casting_func = plc.strings.convert.convert_datetime.to_timestamps add_back_nat = is_nat.any() elif dtype.kind == "m": # type: ignore[union-attr] - casting_func = str_cast.timedelta2int + casting_func = plc.strings.convert.convert_durations.to_durations add_back_nat = False - result_col = casting_func(self, dtype, format) + with acquire_spill_lock(): + plc_dtype = dtype_to_pylibcudf_type(dtype) + result_col = type(self).from_pylibcudf( + casting_func(self.to_pylibcudf(mode="read"), plc_dtype, format) + ) if add_back_nat: result_col[is_nat] = None - return result_col + return result_col # type: ignore[return-value] def as_datetime_column( self, dtype: Dtype @@ -6394,15 +6346,15 @@ def detokenize(self, indices: ColumnBase, separator: cudf.Scalar) -> Self: ) ) + @acquire_spill_lock() def _modify_characters( self, method: Callable[[plc.Column], plc.Column] ) -> Self: """ Helper function for methods that modify characters e.g. to_lower """ - with acquire_spill_lock(): - plc_column = method(self.to_pylibcudf(mode="read")) - return cast(Self, Column.from_pylibcudf(plc_column)) + plc_column = method(self.to_pylibcudf(mode="read")) + return cast(Self, Column.from_pylibcudf(plc_column)) def to_lower(self) -> Self: return self._modify_characters(plc.strings.case.to_lower) @@ -6431,6 +6383,46 @@ def replace_multiple(self, pattern: Self, replacements: Self) -> Self: ) return cast(Self, Column.from_pylibcudf(plc_result)) + @acquire_spill_lock() + def is_hex(self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.strings.convert.convert_integers.is_hex( + self.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def hex_to_integers(self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.strings.convert.convert_integers.hex_to_integers( + self.to_pylibcudf(mode="read"), plc.DataType(plc.TypeId.INT64) + ) + ) + + @acquire_spill_lock() + def is_ipv4(self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.strings.convert.convert_ipv4.is_ipv4( + self.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def ipv4_to_integers(self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.strings.convert.convert_ipv4.ipv4_to_integers( + self.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def is_timestamp(self, format: str) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.strings.convert.convert_datetime.is_timestamp( + self.to_pylibcudf(mode="read"), format + ) + ) + @acquire_spill_lock() def _split_record_re( self, diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 8b1515acae2..417fa99dac0 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -10,9 +10,10 @@ import pandas as pd import pyarrow as pa +import pylibcudf as plc + import cudf import cudf.core.column.column as column -import cudf.core.column.string as string from cudf.api.types import is_scalar from cudf.core._internals import binaryop, unary from cudf.core.buffer import Buffer, acquire_spill_lock @@ -297,9 +298,12 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn: column.column_empty(0, dtype="object"), ) else: - return string._timedelta_to_str_typecast_functions[self.dtype]( - self, format=format - ) + with acquire_spill_lock(): + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.strings.convert.convert_durations.from_durations( + self.to_pylibcudf(mode="read"), format + ) + ) def as_string_column(self) -> cudf.core.column.StringColumn: return self.strftime("%D days %H:%M:%S") diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 40348461f8c..6d3dc2dc7d9 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -8,7 +8,6 @@ import pandas as pd import cudf -from cudf import _lib as libcudf from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core._internals import unary from cudf.core.column import as_column @@ -251,9 +250,9 @@ def _convert_str_col( return converted_col.astype(dtype=cudf.dtype("float64")) # type: ignore[return-value] else: if errors == "coerce": - converted_col = libcudf.string_casting.stod(converted_col) non_numerics = is_float.unary_operator("not") converted_col[non_numerics] = None + converted_col = converted_col.astype(np.dtype(np.float64)) # type: ignore[assignment] return converted_col # type: ignore[return-value] else: raise ValueError("Unable to convert some strings to numerics.")