From 6c70f3de2d9e1dbb2f6e2d389118cb3044325311 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 6 Sep 2023 15:37:23 -0700 Subject: [PATCH 1/5] Fix Index.difference --- python/cudf/cudf/core/_base_index.py | 12 ++++++++++-- python/cudf/cudf/tests/test_index.py | 3 +++ python/cudf/cudf/utils/dtypes.py | 5 +++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 829ca33d8a5..8091f3f7dd2 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -30,7 +30,7 @@ from cudf.core.column import ColumnBase, column from cudf.core.column_accessor import ColumnAccessor from cudf.utils import ioutils -from cudf.utils.dtypes import is_mixed_with_object_dtype +from cudf.utils.dtypes import can_convert_to_column, is_mixed_with_object_dtype from cudf.utils.utils import _is_same_name @@ -935,13 +935,21 @@ def difference(self, other, sort=None): >>> idx1.difference(idx2, sort=False) Int64Index([2, 1], dtype='int64') """ + if not can_convert_to_column(other): + raise TypeError("Input must be Index or array-like") + if sort not in {None, False}: raise ValueError( f"The 'sort' keyword only takes the values " f"of None or False; {sort} was passed." ) - other = cudf.Index(other) + other = cudf.Index(other, name=getattr(other, "name", self.name)) + + if not len(other): + return self._get_reconciled_name_object(other) + elif self.equals(other): + return self[:0]._get_reconciled_name_object(other) res_name = _get_result_name(self.name, other.name) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 359b3c519de..9c822c5ad9a 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -788,6 +788,8 @@ def test_index_to_series(data): ["5", "6", "2", "a", "b", "c"], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], [1.0, 5.0, 6.0, 0.0, 1.3], + pd.Series(["1", "2", "a", "3", None], dtype="category"), + [], ], ) @pytest.mark.parametrize( @@ -800,6 +802,7 @@ def test_index_to_series(data): ["5", "6", "2", "a", "b", "c"], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], [1.0, 5.0, 6.0, 0.0, 1.3], + pd.Series(["1", "2", "a", "3", None], dtype="category"), [], ], ) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index ea96a0859ce..e50457b8e7b 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -426,6 +426,11 @@ def get_min_float_dtype(col): def is_mixed_with_object_dtype(lhs, rhs): + if cudf.api.types.is_categorical_dtype(lhs.dtype): + return is_mixed_with_object_dtype(lhs.dtype.categories, rhs) + elif cudf.api.types.is_categorical_dtype(rhs.dtype): + return is_mixed_with_object_dtype(lhs, rhs.dtype.categories) + return (lhs.dtype == "object" and rhs.dtype != "object") or ( rhs.dtype == "object" and lhs.dtype != "object" ) From 611c79b1d23775af5b8c870ee5b19e7bab132599 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 6 Sep 2023 17:01:28 -0700 Subject: [PATCH 2/5] Fix Index.difference --- python/cudf/cudf/core/index.py | 23 +++++++++++++++++++++++ python/cudf/cudf/tests/test_index.py | 5 +++++ 2 files changed, 28 insertions(+) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index c7e25cdc430..518f3e5e4db 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -724,6 +724,29 @@ def _intersection(self, other, sort=False): return new_index + @_cudf_nvtx_annotate + def difference(self, other, sort=None): + if isinstance(other, RangeIndex) and self.equals(other): + return self[:0]._get_reconciled_name_object(other) + + return self._try_reconstruct_range_index( + super().difference(other, sort=sort) + ) + + def _try_reconstruct_range_index(self, index): + if index.dtype.kind == "f": + return index + # Evenly spaced values can return a + # RangeIndex instead of Int64Index + unique_diffs = index.to_frame(name="None").diff()["None"].unique() + if len(unique_diffs) == 2 and ( + unique_diffs[0] is cudf.NA and unique_diffs[1] != 0 + ): + diff = unique_diffs[1] + new_range = range(index[0], index[-1] + diff, diff) + return type(self)(new_range, name=index.name) + return index + def sort_values( self, return_indexer=False, diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 9c822c5ad9a..5d21258b6e5 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -788,7 +788,9 @@ def test_index_to_series(data): ["5", "6", "2", "a", "b", "c"], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], [1.0, 5.0, 6.0, 0.0, 1.3], + ["ab", "cd", "ef"], pd.Series(["1", "2", "a", "3", None], dtype="category"), + range(0, 10), [], ], ) @@ -800,8 +802,10 @@ def test_index_to_series(data): [10, 20, 30, 40, 50, 60], ["1", "2", "3", "4", "5", "6"], ["5", "6", "2", "a", "b", "c"], + ["ab", "ef", None], [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], [1.0, 5.0, 6.0, 0.0, 1.3], + range(2, 4), pd.Series(["1", "2", "a", "3", None], dtype="category"), [], ], @@ -820,6 +824,7 @@ def test_index_difference(data, other, sort, name_data, name_other): expected = pd_data.difference(pd_other, sort=sort) actual = gd_data.difference(gd_other, sort=sort) + assert_eq(expected, actual) From a5030d808ecd06d0725d286012e233cecf320c0b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 6 Sep 2023 17:03:26 -0700 Subject: [PATCH 3/5] slight change in test --- python/cudf/cudf/tests/test_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 5d21258b6e5..6f5480d6a5e 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -806,7 +806,7 @@ def test_index_to_series(data): [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], [1.0, 5.0, 6.0, 0.0, 1.3], range(2, 4), - pd.Series(["1", "2", "a", "3", None], dtype="category"), + pd.Series(["1", "a", "3", None], dtype="category"), [], ], ) From 14429887b6b58945bd9acf20efad63fadb5a1d39 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 7 Sep 2023 17:12:00 -0700 Subject: [PATCH 4/5] use cupy --- python/cudf/cudf/core/index.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 518f3e5e4db..4bb5428838f 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -734,17 +734,16 @@ def difference(self, other, sort=None): ) def _try_reconstruct_range_index(self, index): - if index.dtype.kind == "f": + if isinstance(index, RangeIndex) or index.dtype.kind == "f": return index # Evenly spaced values can return a - # RangeIndex instead of Int64Index - unique_diffs = index.to_frame(name="None").diff()["None"].unique() - if len(unique_diffs) == 2 and ( - unique_diffs[0] is cudf.NA and unique_diffs[1] != 0 - ): - diff = unique_diffs[1] - new_range = range(index[0], index[-1] + diff, diff) - return type(self)(new_range, name=index.name) + # RangeIndex instead of a materialized Index. + if not index._column.has_nulls(): + uniques = cupy.unique(cupy.diff(index.values)) + if len(uniques) == 1 and uniques[0].get() != 0: + diff = uniques[0].get() + new_range = range(index[0], index[-1] + diff, diff) + return type(self)(new_range, name=index.name) return index def sort_values( From ea46a79ba4ac472485992277706a75c992601185 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 7 Sep 2023 17:14:45 -0700 Subject: [PATCH 5/5] Add tests for invalid cases --- python/cudf/cudf/tests/test_index.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 77e9966aa2f..58dbc48e31e 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -829,6 +829,19 @@ def test_index_difference(data, other, sort, name_data, name_other): assert_eq(expected, actual) +@pytest.mark.parametrize("other", ["a", 1, None]) +def test_index_difference_invalid_inputs(other): + pdi = pd.Index([1, 2, 3]) + gdi = cudf.Index([1, 2, 3]) + + assert_exceptions_equal( + pdi.difference, + gdi.difference, + ([other], {}), + ([other], {}), + ) + + def test_index_difference_sort_error(): pdi = pd.Index([1, 2, 3]) gdi = cudf.Index([1, 2, 3])