Skip to content

Commit

Permalink
Fix to_numeric fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke committed Nov 21, 2024
1 parent 346d935 commit dc5a8c6
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 30 deletions.
79 changes: 52 additions & 27 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import numpy as np
import pandas as pd
import pyarrow as pa
from typing_extensions import Self

import pylibcudf as plc

Expand Down Expand Up @@ -1096,26 +1097,21 @@ def replace(
"`pat` and `repl` are list-like inputs"
)

with acquire_spill_lock():
if regex:
if regex:
with acquire_spill_lock():
plc_result = plc.strings.replace_re.replace_re(
self._column.to_pylibcudf(mode="read"),
list(pat),
column.as_column(repl, dtype="str").to_pylibcudf(
mode="read"
),
)
else:
plc_result = plc.strings.replace.replace_multiple(
self._column.to_pylibcudf(mode="read"),
column.as_column(pat, dtype="str").to_pylibcudf(
mode="read"
),
column.as_column(repl, dtype="str").to_pylibcudf(
mode="read"
),
)
result = Column.from_pylibcudf(plc_result)
result = Column.from_pylibcudf(plc_result)
else:
result = self._column.replace_multiple(
cast(StringColumn, column.as_column(pat, dtype="str")),
cast(StringColumn, column.as_column(repl, dtype="str")),
)
return self._return_or_inplace(result)
# Pandas treats 0 as all
if n == 0:
Expand Down Expand Up @@ -1959,14 +1955,6 @@ def isipv4(self) -> SeriesOrIndex:
"""
return self._return_or_inplace(str_cast.is_ipv4(self._column))

def _modify_characters(
    self, method: Callable[[plc.Column], plc.Column]
) -> SeriesOrIndex:
    """
    Apply a pylibcudf character transformation to the underlying column.

    Parameters
    ----------
    method : Callable[[plc.Column], plc.Column]
        Function called with the pylibcudf representation of
        ``self._column`` (obtained with ``mode="read"``); it returns the
        transformed pylibcudf column.

    Returns
    -------
    SeriesOrIndex
        Whatever ``self._return_or_inplace`` produces for the column built
        from the transformed pylibcudf column.
    """
    # The pylibcudf view is obtained and consumed under the spill lock.
    with acquire_spill_lock():
        plc_column = method(self._column.to_pylibcudf(mode="read"))
        result = Column.from_pylibcudf(plc_column)
    return self._return_or_inplace(result)

def lower(self) -> SeriesOrIndex:
"""
Converts all characters to lowercase.
Expand Down Expand Up @@ -2006,7 +1994,7 @@ def lower(self) -> SeriesOrIndex:
3 swapcase
dtype: object
"""
return self._modify_characters(plc.strings.case.to_lower)
return self._return_or_inplace(self._column.to_lower())

def upper(self) -> SeriesOrIndex:
"""
Expand Down Expand Up @@ -2057,7 +2045,7 @@ def upper(self) -> SeriesOrIndex:
3 SWAPCASE
dtype: object
"""
return self._modify_characters(plc.strings.case.to_upper)
return self._return_or_inplace(self._column.to_upper())

def capitalize(self) -> SeriesOrIndex:
"""
Expand Down Expand Up @@ -2085,7 +2073,7 @@ def capitalize(self) -> SeriesOrIndex:
1 Goodbye, friend
dtype: object
"""
return self._modify_characters(plc.strings.capitalize.capitalize)
return self._return_or_inplace(self._column.capitalize())

def swapcase(self) -> SeriesOrIndex:
"""
Expand Down Expand Up @@ -2132,7 +2120,7 @@ def swapcase(self) -> SeriesOrIndex:
3 sWaPcAsE
dtype: object
"""
return self._modify_characters(plc.strings.case.swapcase)
return self._return_or_inplace(self._column.swapcase())

def title(self) -> SeriesOrIndex:
"""
Expand Down Expand Up @@ -2179,7 +2167,7 @@ def title(self) -> SeriesOrIndex:
3 Swapcase
dtype: object
"""
return self._modify_characters(plc.strings.capitalize.title)
return self._return_or_inplace(self._column.title())

def istitle(self) -> SeriesOrIndex:
"""
Expand All @@ -2205,7 +2193,7 @@ def istitle(self) -> SeriesOrIndex:
3 False
dtype: bool
"""
return self._modify_characters(plc.strings.capitalize.is_title)
return self._return_or_inplace(self._column.is_title())

def filter_alphanum(
self, repl: str | None = None, keep: bool = True
Expand Down Expand Up @@ -6327,3 +6315,40 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase":
)

return to_view.view(dtype)

def _modify_characters(
    self, method: Callable[[plc.Column], plc.Column]
) -> Self:
    """
    Helper function for methods that modify characters e.g. to_lower

    Parameters
    ----------
    method : Callable[[plc.Column], plc.Column]
        Function called with this column's pylibcudf representation
        (obtained with ``mode="read"``); it returns the resulting
        pylibcudf column.

    Returns
    -------
    Self
        The output of ``method`` wrapped by ``Column.from_pylibcudf``.
    """
    # The pylibcudf view is obtained and consumed under the spill lock.
    with acquire_spill_lock():
        plc_column = method(self.to_pylibcudf(mode="read"))
        # `cast` is presumably `typing.cast`, i.e. a type-checker hint only.
        return cast(Self, Column.from_pylibcudf(plc_column))

def to_lower(self) -> Self:
    """
    Return the result of applying ``plc.strings.case.to_lower`` to this
    column via ``_modify_characters``.
    """
    return self._modify_characters(plc.strings.case.to_lower)

def to_upper(self) -> Self:
    """
    Return the result of applying ``plc.strings.case.to_upper`` to this
    column via ``_modify_characters``.
    """
    return self._modify_characters(plc.strings.case.to_upper)

def capitalize(self) -> Self:
    """
    Return the result of applying ``plc.strings.capitalize.capitalize`` to
    this column via ``_modify_characters``.
    """
    return self._modify_characters(plc.strings.capitalize.capitalize)

def swapcase(self) -> Self:
    """
    Return the result of applying ``plc.strings.case.swapcase`` to this
    column via ``_modify_characters``.
    """
    return self._modify_characters(plc.strings.case.swapcase)

def title(self) -> Self:
    """
    Return the result of applying ``plc.strings.capitalize.title`` to this
    column via ``_modify_characters``.
    """
    return self._modify_characters(plc.strings.capitalize.title)

def is_title(self) -> Self:
    """
    Return the result of applying ``plc.strings.capitalize.is_title`` to
    this column via ``_modify_characters``.
    """
    # NOTE(review): annotated as Self like its siblings, but the accessor's
    # ``istitle`` example shows ``dtype: bool``, so the result is presumably
    # a boolean column rather than a string column -- confirm the annotation.
    return self._modify_characters(plc.strings.capitalize.is_title)

def replace_multiple(self, pattern: Self, replacements: Self) -> Self:
    """
    Call ``plc.strings.replace.replace_multiple`` on this column.

    Parameters
    ----------
    pattern : Self
        Column passed as the second argument (the targets) to
        ``plc.strings.replace.replace_multiple``.
    replacements : Self
        Column passed as the third argument (the replacements) to
        ``plc.strings.replace.replace_multiple``.

    Returns
    -------
    Self
        The pylibcudf result wrapped by ``Column.from_pylibcudf``.
    """
    # All three pylibcudf views are obtained and consumed under the spill lock.
    with acquire_spill_lock():
        plc_result = plc.strings.replace.replace_multiple(
            self.to_pylibcudf(mode="read"),
            pattern.to_pylibcudf(mode="read"),
            replacements.to_pylibcudf(mode="read"),
        )
        return cast(Self, Column.from_pylibcudf(plc_result))
5 changes: 2 additions & 3 deletions python/cudf/cudf/core/tools/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,12 +242,11 @@ def _convert_str_col(col, errors, _downcast=None):

def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase:
"""Handles empty and infinity strings"""
col = libstrings.to_lower(col)
col = col.to_lower() # type: ignore[attr-defined]
col = col.find_and_replace(as_column([""]), as_column(["NaN"]))
# TODO: This can be handled by libcudf in
# future see StringColumn.as_numerical_column
col = libstrings.replace_multi(
col,
col = col.replace_multiple( # type: ignore[attr-defined]
as_column(["+", "inf", "inity"]),
as_column(["", "Inf", ""]),
)
Expand Down

0 comments on commit dc5a8c6

Please sign in to comment.