Skip to content

Commit

Permalink
Align IntervalIndex APIs with pandas 2.x (#16371)
Browse files Browse the repository at this point in the history
Implemented the missing APIs that were relatively straightforward, and raised `NotImplementedError` for the others.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #16371
  • Loading branch information
mroeschke authored Jul 31, 2024
1 parent e2d45d6 commit dab8660
Show file tree
Hide file tree
Showing 4 changed files with 229 additions and 6 deletions.
15 changes: 10 additions & 5 deletions docs/cudf/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,15 +559,20 @@ def on_missing_reference(app, env, node, contnode):
("py:obj", "cudf.DatetimeIndex.time"),
("py:obj", "cudf.DatetimeIndex.date"),
("py:obj", "cudf.Index.values_host"),
("py:class", "pa.Array"),
("py:class", "ScalarLike"),
("py:class", "ParentType"),
("py:class", "ColumnLike"),
("py:class", "ColumnLike"),
("py:obj", "cudf.Index.transpose"),
("py:obj", "cudf.Index.T"),
("py:obj", "cudf.Index.to_flat_index"),
("py:obj", "cudf.MultiIndex.to_flat_index"),
("py:meth", "pyarrow.Table.to_pandas"),
("py:class", "pa.Array"),
("py:class", "ScalarLike"),
("py:class", "ParentType"),
("py:class", "pyarrow.lib.DataType"),
("py:class", "pyarrow.lib.Table"),
("py:class", "pyarrow.lib.Scalar"),
("py:class", "pyarrow.lib.ChunkedArray"),
("py:class", "pyarrow.lib.Array"),
("py:class", "ColumnLike"),
# TODO: Remove this when we figure out why typing_extensions doesn't seem
# to map types correctly for intersphinx
("py:class", "typing_extensions.Self"),
Expand Down
64 changes: 63 additions & 1 deletion python/cudf/cudf/core/column/interval.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
from __future__ import annotations

from typing import TYPE_CHECKING, Literal

import pandas as pd
import pyarrow as pa

import cudf
from cudf.core.column import StructColumn
from cudf.core.column import StructColumn, as_column
from cudf.core.dtypes import IntervalDtype

if TYPE_CHECKING:
from cudf.core.column import ColumnBase


class IntervalColumn(StructColumn):
def __init__(
Expand Down Expand Up @@ -85,6 +92,61 @@ def copy(self, deep=True):
children=struct_copy.base_children,
)

@property
def is_empty(self) -> ColumnBase:
    """Flag, per interval, whether it is empty.

    An entry is True when the left and right bounds compare equal (null
    comparison results are filled with False) and the dtype's ``closed``
    setting is anything other than ``"both"``.
    """
    same_bounds = (self.right == self.left).fillna(False)
    # The closedness test is a single dtype-level bool; as_column is asked
    # for len(self) entries so it can be AND-ed with the per-row result.
    open_on_a_side = as_column(
        self.dtype.closed != "both", length=len(self)
    )
    return same_bounds & open_on_a_side

@property
def is_non_overlapping_monotonic(self) -> bool:
    """Whether the intervals are non-overlapping and monotonic.

    Raises
    ------
    NotImplementedError
        Always; this check is not implemented for interval columns.
    """
    # The message must name this property so users are not pointed at
    # ``is_overlapping`` by mistake.
    raise NotImplementedError(
        "is_non_overlapping_monotonic is currently not implemented."
    )

@property
def is_overlapping(self) -> bool:
    """Whether any intervals in the column overlap.

    Raises
    ------
    NotImplementedError
        Always; this check is not implemented for interval columns.
    """
    message = "is_overlapping is currently not implemented."
    raise NotImplementedError(message)

@property
def length(self) -> ColumnBase:
    """Width of each interval: the right bound minus the left bound."""
    upper = self.right
    lower = self.left
    return upper - lower

@property
def left(self) -> ColumnBase:
    """Left bounds of the intervals, i.e. the first child column."""
    child_columns = self.children
    return child_columns[0]

@property
def mid(self) -> ColumnBase:
    """Midpoint of each interval.

    The plain average of both bounds is tried first. If that raises
    ``TypeError``, the midpoint is computed as ``left + 0.5 * length``
    instead.
    """
    try:
        bounds_sum = self.left + self.right
        return 0.5 * bounds_sum
    except TypeError:
        # datetime safe version
        lower = self.left
        return lower + 0.5 * self.length

@property
def right(self) -> ColumnBase:
    """Right bounds of the intervals, i.e. the second child column."""
    child_columns = self.children
    return child_columns[1]

def overlaps(self, other) -> ColumnBase:
    """Check elementwise whether ``other`` overlaps each interval.

    Parameters
    ----------
    other
        The interval to compare against; currently unused.

    Raises
    ------
    NotImplementedError
        Always; this operation is not implemented for interval columns.
    """
    # ``self`` is required: this is called as ``column.overlaps(other)``,
    # and without it the call fails with a TypeError about argument count
    # instead of the intended NotImplementedError.
    raise NotImplementedError("overlaps is not currently implemented.")

def set_closed(
    self, closed: Literal["left", "right", "both", "neither"]
) -> IntervalColumn:
    """Build an interval column over the same data with a new closed side.

    Parameters
    ----------
    closed : {'left', 'right', 'both', 'neither'}
        Closed side recorded in the new column's ``IntervalDtype``.

    Returns
    -------
    IntervalColumn
        Constructed from this column's size, base mask, offset, null count
        and base children; only the dtype differs.
    """
    # The subtype is taken from the "left" field of the current dtype.
    column_kwargs = {
        "size": self.size,
        "dtype": IntervalDtype(self.dtype.fields["left"], closed),
        "mask": self.base_mask,
        "offset": self.offset,
        "null_count": self.null_count,
        "children": self.base_children,
    }
    return IntervalColumn(**column_kwargs)

def as_interval_column(self, dtype):
if isinstance(dtype, IntervalDtype):
return IntervalColumn(
Expand Down
123 changes: 123 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -3429,6 +3429,31 @@ def from_breaks(
)
return IntervalIndex(interval_col, name=name, closed=closed)

@classmethod
def from_arrays(
    cls,
    left,
    right,
    closed: Literal["left", "right", "both", "neither"] = "right",
    copy: bool = False,
    dtype=None,
) -> Self:
    """Construct an IntervalIndex from arrays of left and right bounds.

    Raises
    ------
    NotImplementedError
        Always; none of the arguments are inspected.
    """
    message = "from_arrays is currently not supported."
    raise NotImplementedError(message)

@classmethod
def from_tuples(
    cls,
    data,
    closed: Literal["left", "right", "both", "neither"] = "right",
    name=None,
    copy: bool = False,
    dtype=None,
) -> IntervalIndex:
    """Construct an IntervalIndex from an array-like of tuples.

    All arguments are forwarded unchanged to
    ``pandas.IntervalIndex.from_tuples``; the resulting pandas index is
    then converted with ``cls.from_pandas``.

    Parameters
    ----------
    data
        Array-like of ``(left, right)`` tuples.
    closed : {'left', 'right', 'both', 'neither'}, default 'right'
        Which side(s) of the intervals are closed.
    name : optional
        Name of the resulting index.
    copy : bool, default False
        Passed through to pandas.
    dtype : optional
        Passed through to pandas.
    """
    return cls.from_pandas(
        pd.IntervalIndex.from_tuples(
            data, closed=closed, name=name, copy=copy, dtype=dtype
        )
    )

def __getitem__(self, index):
raise NotImplementedError(
"Getting a scalar from an IntervalIndex is not yet supported"
Expand All @@ -3443,6 +3468,104 @@ def _is_boolean(self):
def _clean_nulls_from_index(self):
    """Return this index itself; no null handling is applied here."""
    return self

@property
def is_empty(self) -> cupy.ndarray:
    """
    Indicates, for each interval, whether it is empty (contains no points).

    The result is the ``values`` of the underlying column's ``is_empty``.
    """
    empty_flags = self._column.is_empty
    return empty_flags.values

@property
def is_non_overlapping_monotonic(self) -> bool:
    """
    Return True if the IntervalIndex is non-overlapping and monotonic.

    The check is delegated to the underlying column.
    """
    column = self._column
    return column.is_non_overlapping_monotonic

@property
def is_overlapping(self) -> bool:
    """
    Return True if the IntervalIndex has overlapping intervals, else False.

    Currently not implemented. The check is delegated to the underlying
    column.
    """
    column = self._column
    return column.is_overlapping

@property
def length(self) -> Index:
    """
    Return an Index whose entries are the lengths of each Interval.

    The Index is built, unnamed, from the underlying column's ``length``.
    """
    unnamed_data = {None: self._column.length}
    return _index_from_data(unnamed_data)

@property
def left(self) -> Index:
    """
    Return the left bounds of the intervals in the IntervalIndex.

    The left bound of every interval is returned in an unnamed Index
    built from the underlying column's ``left``. The datatype of the
    left bounds is the same as the datatype of the interval endpoints.
    """
    unnamed_data = {None: self._column.left}
    return _index_from_data(unnamed_data)

@property
def mid(self) -> Index:
    """
    Return the midpoint of each interval in the IntervalIndex as an Index.

    Each midpoint is the average of the interval's left and right bounds;
    the computation is done by the underlying column's ``mid``.
    """
    unnamed_data = {None: self._column.mid}
    return _index_from_data(unnamed_data)

@property
def right(self) -> Index:
    """
    Return the right bounds of the intervals in the IntervalIndex.

    The right bound of every interval is returned in an unnamed Index
    built from the underlying column's ``right``. The datatype of the
    right bounds is the same as the datatype of the interval endpoints.
    """
    unnamed_data = {None: self._column.right}
    return _index_from_data(unnamed_data)

def overlaps(self, other) -> cupy.ndarray:
    """
    Check elementwise if an Interval overlaps the values in the IntervalIndex.

    Currently not supported. The call is forwarded to the underlying
    column's ``overlaps`` and the ``values`` of its result are returned.
    """
    column_result = self._column.overlaps(other)
    return column_result.values

def set_closed(
    self, closed: Literal["left", "right", "both", "neither"]
) -> Self:
    """
    Return an index of the same type over the same intervals, closed on
    the specified side.

    Parameters
    ----------
    closed : {'left', 'right', 'both', 'neither'}
        Whether the intervals are closed on the left-side, right-side, both
        or neither.

    Returns
    -------
    Self
        Built with ``type(self)._from_data`` from the re-closed column,
        keyed by this index's current name.
    """
    reclosed_data = {self.name: self._column.set_closed(closed)}
    return type(self)._from_data(reclosed_data)

def to_tuples(self, na_tuple: bool = True) -> pd.Index:
    """
    Return an Index of tuples of the form (left, right).

    The index is first converted with ``to_pandas``; the tuples are then
    produced by the pandas object's ``to_tuples``.

    Parameters
    ----------
    na_tuple : bool, default True
        If ``True``, return ``NA`` as a tuple ``(nan, nan)``. If ``False``,
        just return ``NA`` as ``nan``.
    """
    pandas_index = self.to_pandas()
    return pandas_index.to_tuples(na_tuple=na_tuple)


@_performance_tracking
def as_index(
Expand Down
33 changes: 33 additions & 0 deletions python/cudf/cudf/tests/indexes/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,3 +368,36 @@ def test_intervalindex_conflicting_closed():
def test_intervalindex_invalid_data():
    """Constructing an IntervalIndex from plain integers raises TypeError."""
    non_interval_data = [1, 2]
    with pytest.raises(TypeError):
        cudf.IntervalIndex(non_interval_data)


@pytest.mark.parametrize(
    "attr", ["is_empty", "length", "left", "right", "mid"]
)
def test_intervalindex_properties(attr):
    """Each listed property on a cudf IntervalIndex matches pandas."""
    # The first interval has equal bounds, the second does not.
    pandas_index = pd.IntervalIndex.from_arrays([0, 1], [0, 2])
    gpu_index = cudf.from_pandas(pandas_index)

    actual = getattr(gpu_index, attr)
    reference = getattr(pandas_index, attr)
    assert_eq(actual, reference)


def test_set_closed():
    """set_closed("both") on a cudf IntervalIndex matches pandas."""
    intervals = [pd.Interval(0, 1)]
    actual = cudf.IntervalIndex(intervals).set_closed("both")
    reference = pd.IntervalIndex(intervals).set_closed("both")
    assert_eq(actual, reference)


def test_from_tuples():
    """IntervalIndex.from_tuples with closed and name matches pandas."""
    tuples = [(1, 2), (10, 20)]
    options = {"closed": "left", "name": "a"}
    actual = cudf.IntervalIndex.from_tuples(tuples, **options)
    reference = pd.IntervalIndex.from_tuples(tuples, **options)
    assert_eq(actual, reference)

0 comments on commit dab8660

Please sign in to comment.