Skip to content

Commit

Permalink
Fix Index.difference to handle duplicate values when one of the inputs is empty (rapidsai#15016)
Browse files Browse the repository at this point in the history

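This PR makes the two short-circuit code paths of `Index.difference` (when `other` is empty, and when `self` equals `other`) drop duplicate values from the result, matching the behavior that was already fixed in `pandas-2.2`.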

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: rapidsai#15016
  • Loading branch information
galipremsagar authored Feb 10, 2024
1 parent 0c0c7e6 commit 8edbeca
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 4 deletions.
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1096,12 +1096,12 @@ def difference(self, other, sort=None):
other = cudf.Index(other, name=getattr(other, "name", self.name))

if not len(other):
res = self._get_reconciled_name_object(other)
res = self._get_reconciled_name_object(other).unique()
if sort:
return res.sort_values()
return res
elif self.equals(other):
res = self[:0]._get_reconciled_name_object(other)
res = self[:0]._get_reconciled_name_object(other).unique()
if sort:
return res.sort_values()
return res
Expand Down
21 changes: 19 additions & 2 deletions python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import cudf
from cudf.api.extensions import no_default
from cudf.api.types import is_bool_dtype
from cudf.core._compat import PANDAS_GE_200
from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_220
from cudf.core.index import (
CategoricalIndex,
DatetimeIndex,
Expand Down Expand Up @@ -797,9 +797,26 @@ def test_index_to_series(data):
"name_data,name_other",
[("abc", "c"), (None, "abc"), ("abc", pd.NA), ("abc", "abc")],
)
def test_index_difference(data, other, sort, name_data, name_other):
def test_index_difference(request, data, other, sort, name_data, name_other):
pd_data = pd.Index(data, name=name_data)
pd_other = pd.Index(other, name=name_other)
request.applymarker(
pytest.mark.xfail(
condition=PANDAS_GE_220
and isinstance(pd_data.dtype, pd.CategoricalDtype)
and not isinstance(pd_other.dtype, pd.CategoricalDtype)
and pd_other.isnull().any(),
reason="https://github.com/pandas-dev/pandas/issues/57318",
)
)
request.applymarker(
pytest.mark.xfail(
condition=not PANDAS_GE_220
and len(pd_other) == 0
and len(pd_data) != len(pd_data.unique()),
reason="Bug fixed in pandas-2.2+",
)
)

gd_data = cudf.from_pandas(pd_data)
gd_other = cudf.from_pandas(pd_other)
Expand Down

0 comments on commit 8edbeca

Please sign in to comment.