Skip to content

Commit

Permalink
Don't materialize column during RangeIndex methods (#15582)
Browse files Browse the repository at this point in the history
Additionally, this implements several methods that are defined on `BaseIndex` but were not yet implemented on `RangeIndex`, and adds type annotations.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #15582
  • Loading branch information
mroeschke authored Apr 24, 2024
1 parent 8b4dc91 commit 70a5b2b
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 41 deletions.
10 changes: 8 additions & 2 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,7 @@ def where(self, cond, other=None, inplace=False):
"""
raise NotImplementedError

def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None):
def factorize(self, sort: bool = False, use_na_sentinel: bool = True):
raise NotImplementedError

def union(self, other, sort=None):
Expand Down Expand Up @@ -2061,7 +2061,13 @@ def dropna(self, how="any"):
one null value. "all" drops only rows containing
*all* null values.
"""

if how not in {"any", "all"}:
raise ValueError(f"{how=} must be 'any' or 'all'")
try:
if not self.hasnans:
return self.copy()
except NotImplementedError:
pass
# This is to be consistent with IndexedFrame.dropna to handle nans
# as nulls by default
data_columns = [
Expand Down
108 changes: 69 additions & 39 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import cupy
import numpy as np
import pandas as pd
import pyarrow as pa
from typing_extensions import Self

import cudf
Expand Down Expand Up @@ -248,6 +249,15 @@ def searchsorted(
), "Invalid ascending flag"
return search_range(value, self._range, side=side)

def factorize(self, sort: bool = False, use_na_sentinel: bool = True):
    """
    Encode the index as positional integer codes plus its unique values.

    A RangeIndex never contains duplicates, so the codes are just
    positions and the uniques are the index itself (or its reverse).

    Parameters
    ----------
    sort : bool, default False
        If True, return the uniques in ascending order. This only changes
        the result when ``step`` is negative.
    use_na_sentinel : bool, default True
        Not consulted here; kept so the signature matches
        ``BaseIndex.factorize``.

    Returns
    -------
    tuple
        ``(codes, uniques)`` where ``codes`` is a ``cupy.ndarray`` of
        dtype ``np.intp``.
    """
    if sort and self.step < 0:
        # A descending range sorted ascending is the same range reversed,
        # so element i lands at position len - 1 - i of the uniques.
        # The dtype is pinned so both branches return the same code dtype.
        codes = cupy.arange(len(self) - 1, -1, -1, dtype=np.intp)
        uniques = self[::-1]
    else:
        codes = cupy.arange(len(self), dtype=np.intp)
        uniques = self
    return codes, uniques

@property # type: ignore
@_cudf_nvtx_annotate
def name(self):
Expand All @@ -260,31 +270,31 @@ def name(self, value):

@property # type: ignore
@_cudf_nvtx_annotate
def start(self):
def start(self) -> int:
"""
The value of the `start` parameter (0 if this was not supplied).
"""
return self._range.start

@property # type: ignore
@_cudf_nvtx_annotate
def stop(self):
def stop(self) -> int:
"""
The value of the stop parameter.
"""
return self._range.stop

@property # type: ignore
@_cudf_nvtx_annotate
def step(self):
def step(self) -> int:
"""
The value of the step parameter.
"""
return self._range.step

@property # type: ignore
@_cudf_nvtx_annotate
def _num_rows(self):
def _num_rows(self) -> int:
return len(self)

@cached_property # type: ignore
Expand All @@ -295,33 +305,33 @@ def _values(self):
else:
return column.column_empty(0, masked=False, dtype=self.dtype)

def _clean_nulls_from_index(self):
def _clean_nulls_from_index(self) -> Self:
return self

def _is_numeric(self):
def _is_numeric(self) -> bool:
return True

def _is_boolean(self):
def _is_boolean(self) -> bool:
return False

def _is_integer(self):
def _is_integer(self) -> bool:
return True

def _is_floating(self):
def _is_floating(self) -> bool:
return False

def _is_object(self):
def _is_object(self) -> bool:
return False

def _is_categorical(self):
def _is_categorical(self) -> bool:
return False

def _is_interval(self):
def _is_interval(self) -> bool:
return False

@property # type: ignore
@_cudf_nvtx_annotate
def hasnans(self):
def hasnans(self) -> bool:
return False

@property # type: ignore
Expand Down Expand Up @@ -369,12 +379,15 @@ def astype(self, dtype, copy: bool = True):
return self
return self._as_int_index().astype(dtype, copy=copy)

def fillna(self, value, downcast=None):
    """
    Return a copy of the index.

    ``value`` and ``downcast`` are accepted for signature compatibility
    but are not used: the result is always ``self.copy()``.
    """
    duplicate = self.copy()
    return duplicate

@_cudf_nvtx_annotate
def drop_duplicates(self, keep="first"):
return self

@_cudf_nvtx_annotate
def duplicated(self, keep="first"):
def duplicated(self, keep="first") -> cupy.ndarray:
return cupy.zeros(len(self), dtype=bool)

@_cudf_nvtx_annotate
Expand All @@ -390,6 +403,11 @@ def __repr__(self):
+ ")"
)

@property
@_cudf_nvtx_annotate
def size(self) -> int:
    """Number of elements in the index; identical to ``len(self)``."""
    n_elements = len(self)
    return n_elements

@_cudf_nvtx_annotate
def __len__(self):
return len(self._range)
Expand Down Expand Up @@ -478,12 +496,12 @@ def to_pandas(
)

@property
def is_unique(self):
def is_unique(self) -> bool:
return True

@cached_property # type: ignore
@_cudf_nvtx_annotate
def is_monotonic_increasing(self):
def is_monotonic_increasing(self) -> bool:
return self.step > 0 or len(self) <= 1

@cached_property # type: ignore
Expand All @@ -492,15 +510,15 @@ def is_monotonic_decreasing(self):
return self.step < 0 or len(self) <= 1

@_cudf_nvtx_annotate
def memory_usage(self, deep=False):
def memory_usage(self, deep: bool = False) -> int:
if deep:
warnings.warn(
"The deep parameter is ignored and is only included "
"for pandas compatibility."
)
return 0

def unique(self):
def unique(self) -> Self:
# RangeIndex always has unique values
return self

Expand Down Expand Up @@ -823,34 +841,37 @@ def _columns(self):

@property # type: ignore
@_cudf_nvtx_annotate
def values_host(self):
return self.to_pandas().values
def values_host(self) -> np.ndarray:
return np.arange(start=self.start, stop=self.stop, step=self.step)

@_cudf_nvtx_annotate
def argsort(
self,
ascending=True,
na_position="last",
):
) -> cupy.ndarray:
if na_position not in {"first", "last"}:
raise ValueError(f"invalid na_position: {na_position}")

indices = cupy.arange(0, len(self))
if (ascending and self.step < 0) or (not ascending and self.step > 0):
indices = indices[::-1]
return indices
return cupy.arange(len(self) - 1, -1, -1)
else:
return cupy.arange(len(self))

@_cudf_nvtx_annotate
def where(self, cond, other=None, inplace=False):
    """
    Delegate ``where`` to the index returned by ``_as_int_index()``.

    ``cond``, ``other`` and ``inplace`` are forwarded positionally and
    unchanged; the delegate's result is returned as-is.
    """
    int_index = self._as_int_index()
    return int_index.where(cond, other, inplace)

@_cudf_nvtx_annotate
def to_numpy(self):
def to_numpy(self) -> np.ndarray:
return self.values_host

@_cudf_nvtx_annotate
def to_arrow(self):
return self._as_int_index().to_arrow()
def to_cupy(self) -> cupy.ndarray:
return self.values

@_cudf_nvtx_annotate
def to_arrow(self) -> pa.Array:
    """
    Build a pyarrow array directly from the underlying Python ``range``.

    The arrow type is derived from ``self.dtype`` via
    ``pa.from_numpy_dtype``.
    """
    arrow_type = pa.from_numpy_dtype(self.dtype)
    return pa.array(self._range, type=arrow_type)

def __array__(self, dtype=None):
raise TypeError(
Expand All @@ -861,17 +882,17 @@ def __array__(self, dtype=None):
)

@_cudf_nvtx_annotate
def nunique(self):
def nunique(self) -> int:
return len(self)

@_cudf_nvtx_annotate
def isna(self):
def isna(self) -> cupy.ndarray:
return cupy.zeros(len(self), dtype=bool)

isnull = isna

@_cudf_nvtx_annotate
def notna(self):
def notna(self) -> cupy.ndarray:
return cupy.ones(len(self), dtype=bool)

# Alias of notna. It was previously bound to isna, which made notnull()
# return an all-False mask instead of the all-True mask notna() produces.
notnull = notna
Expand All @@ -895,12 +916,15 @@ def max(self):
return self._minmax("max")

@property
def values(self):
def values(self) -> cupy.ndarray:
return cupy.arange(self.start, self.stop, self.step)

def any(self):
def any(self) -> bool:
return any(self._range)

def all(self) -> bool:
    """
    Return True when no element of the range equals zero.

    Uses ``range`` membership, so an empty range yields True.
    """
    contains_zero = 0 in self._range
    return not contains_zero

def append(self, other):
result = self._as_int_index().append(other)
return self._try_reconstruct_range_index(result)
Expand All @@ -926,14 +950,20 @@ def isin(self, values):

return self._values.isin(values).values

def __neg__(self):
return -self._as_int_index()
def __pos__(self) -> Self:
return self.copy()

def __pos__(self):
return +self._as_int_index()
def __neg__(self) -> Self:
    """
    Return a new index of the same type with every element negated.

    Negating ``start``, ``stop`` and ``step`` describes the negated
    sequence as another ``range``; the name is carried over.
    """
    negated = range(-self.start, -self.stop, -self.step)
    return type(self)(negated, name=self.name)

def __abs__(self):
return abs(self._as_int_index())
def __abs__(self) -> Self | Index:
    """
    Return the element-wise absolute value of the index.

    - Empty, or minimum is >= 0: a copy of ``self`` is returned.
    - Maximum is <= 0: the negated index (``-self``) is returned.
    - Otherwise (mixed signs): falls back to ``abs`` of the index
      returned by ``_as_int_index()``.
    """
    if len(self) == 0 or self.min() >= 0:
        return self.copy()
    if self.max() <= 0:
        return -self
    return abs(self._as_int_index())

@_warn_no_dask_cudf
def __dask_tokenize__(self):
Expand Down
23 changes: 23 additions & 0 deletions python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -3176,3 +3176,26 @@ def test_index_to_pandas_arrow_type(scalar):
result = idx.to_pandas(arrow_type=True)
expected = pd.Index(pd.arrays.ArrowExtensionArray(pa_array))
pd.testing.assert_index_equal(result, expected)


@pytest.mark.parametrize("data", [range(-3, 3), range(1, 3), range(0)])
def test_rangeindex_all(data):
    """RangeIndex.all() must agree with all() on an Index built from the same values."""
    actual = cudf.RangeIndex(data).all()
    reference = cudf.Index(list(data)).all()
    assert actual == reference


@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("data", [range(2), range(2, -1, -1)])
def test_rangeindex_factorize(sort, data):
    """RangeIndex.factorize() must match factorize() on an Index built from the same values."""
    range_index = cudf.RangeIndex(data)
    list_index = cudf.Index(list(data))
    got_codes, got_uniques = range_index.factorize(sort=sort)
    want_codes, want_uniques = list_index.factorize(sort=sort)
    assert_eq(got_codes, want_codes)
    assert_eq(got_uniques, want_uniques)


def test_rangeindex_dropna():
    """dropna() on a RangeIndex must compare equal to a copy of that index."""
    range_index = cudf.RangeIndex(range(2))
    assert_eq(range_index.dropna(), range_index.copy())

0 comments on commit 70a5b2b

Please sign in to comment.