Skip to content

Commit

Permalink
Fix DataFrame.drop(columns=cudf.Series/Index, axis=1) (rapidsai#16712)
Browse files Browse the repository at this point in the history
Before when `columns=` was a `cudf.Series/Index` we would call `return array.unique.to_pandas()`, but `.unique` is a method not a property so this would have raised an error.

Also took the time to refactor the helper methods here and push down the `errors=` keyword to `Frame._drop_column`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: rapidsai#16712
  • Loading branch information
mroeschke authored and rjzamora committed Sep 25, 2024
1 parent 4d28db7 commit 9aa5aca
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 27 deletions.
14 changes: 9 additions & 5 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -767,11 +767,15 @@ def fillna(
)

@_performance_tracking
def _drop_column(self, name):
"""Drop a column by *name*"""
if name not in self._data:
raise KeyError(f"column '{name}' does not exist")
del self._data[name]
def _drop_column(
self, name: abc.Hashable, errors: Literal["ignore", "raise"] = "raise"
) -> None:
"""Drop a column by *name* inplace."""
try:
del self._data[name]
except KeyError as err:
if errors != "ignore":
raise KeyError(f"column '{name}' does not exist") from err

@_performance_tracking
def _quantile_table(
Expand Down
32 changes: 10 additions & 22 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from __future__ import annotations

import numbers
import operator
import textwrap
import warnings
Expand Down Expand Up @@ -150,24 +149,14 @@
)


def _get_host_unique(array):
def _get_unique_drop_labels(array):
"""Return labels to be dropped for IndexFrame.drop."""
if isinstance(array, (cudf.Series, cudf.Index, ColumnBase)):
return array.unique.to_pandas()
elif isinstance(array, (str, numbers.Number)):
return [array]
yield from np.unique(as_column(array).values_host)
elif is_scalar(array):
yield array
else:
return set(array)


def _drop_columns(f: Frame, columns: abc.Iterable, errors: str):
for c in columns:
try:
f._drop_column(c)
except KeyError as e:
if errors == "ignore":
pass
else:
raise e
yield from set(array)


def _indices_from_labels(obj, labels):
Expand Down Expand Up @@ -5262,15 +5251,14 @@ def drop(
out = self.copy()

if axis in (1, "columns"):
target = _get_host_unique(target)

_drop_columns(out, target, errors)
for label in _get_unique_drop_labels(target):
out._drop_column(label, errors=errors)
elif axis in (0, "index"):
dropped = _drop_rows_by_labels(out, target, level, errors)

if columns is not None:
columns = _get_host_unique(columns)
_drop_columns(dropped, columns, errors)
for label in _get_unique_drop_labels(columns):
dropped._drop_column(label, errors=errors)

out._mimic_inplace(dropped, inplace=True)

Expand Down
11 changes: 11 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,17 @@ def test_dataframe_drop_columns(pdf, columns, inplace):
assert_eq(expected, actual)


@pytest.mark.parametrize("obj", ["Index", "Series"])
def test_drop_cudf_obj_columns(obj):
pdf = pd.DataFrame({"A": [1], "B": [1]})
gdf = cudf.from_pandas(pdf)

columns = ["B"]
expected = pdf.drop(labels=getattr(pd, obj)(columns), axis=1)
actual = gdf.drop(columns=getattr(cudf, obj)(columns), axis=1)
assert_eq(expected, actual)


@pytest.mark.parametrize(
"pdf",
[
Expand Down

0 comments on commit 9aa5aca

Please sign in to comment.