Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move cudf._lib.search to cudf.core._internals #17411

Merged
merged 4 commits into from
Nov 23, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ set(cython_sources
rolling.pyx
round.pyx
scalar.pyx
search.pyx
sort.pyx
stream_compaction.pyx
string_casting.pyx
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
reshape,
rolling,
round,
search,
sort,
stream_compaction,
string_casting,
Expand Down
68 changes: 0 additions & 68 deletions python/cudf/cudf/_lib/search.pyx

This file was deleted.

56 changes: 56 additions & 0 deletions python/cudf/cudf/core/_internals/search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
from __future__ import annotations

from typing import TYPE_CHECKING, Literal

import pylibcudf as plc

from cudf._lib.column import Column
from cudf.core.buffer import acquire_spill_lock

if TYPE_CHECKING:
from cudf.core.column import ColumnBase


@acquire_spill_lock()
def search_sorted(
    source: list[ColumnBase],
    values: list[ColumnBase],
    side: Literal["left", "right"],
    ascending: bool = True,
    na_position: Literal["first", "last"] = "last",
) -> ColumnBase:
    """Find indices where elements should be inserted to maintain order

    Parameters
    ----------
    source : list of columns
        List of columns to search in
    values : list of columns
        List of value columns to search for
    side : str {'left', 'right'}
        If 'left', the index of the first suitable location is given
        (``plc.search.lower_bound`` is used). Any other value selects
        ``plc.search.upper_bound``, i.e. the last such index.
    ascending : bool, default True
        When True every column of ``source`` is searched with
        ``Order.ASCENDING``, otherwise with ``Order.DESCENDING``.
    na_position : str {'first', 'last'}, default 'last'
        'last' maps to ``NullOrder.AFTER`` for every column of ``source``;
        any other value maps to ``NullOrder.BEFORE``.

    Returns
    -------
    Column
        The pylibcudf search result wrapped via ``Column.from_pylibcudf``.
    """
    # Note: We are ignoring index columns here
    if ascending:
        order = plc.types.Order.ASCENDING
    else:
        order = plc.types.Order.DESCENDING
    per_column_order = [order] * len(source)

    if na_position == "last":
        null_order = plc.types.NullOrder.AFTER
    else:
        null_order = plc.types.NullOrder.BEFORE
    per_column_nulls = [null_order] * len(source)

    # The same ordering is applied to each source column, hence the
    # repeated single-element lists above.
    if side == "left":
        bound = plc.search.lower_bound
    else:
        bound = plc.search.upper_bound

    haystack = plc.Table([col.to_pylibcudf(mode="read") for col in source])
    needles = plc.Table([col.to_pylibcudf(mode="read") for col in values])
    positions = bound(haystack, needles, per_column_order, per_column_nulls)
    return Column.from_pylibcudf(positions)
23 changes: 20 additions & 3 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -757,7 +757,7 @@ def indices_of(
raise ValueError("value must be a scalar")
else:
value = as_column(value, dtype=self.dtype, length=1)
mask = libcudf.search.contains(value, self)
mask = value.contains(self)
return apply_boolean_mask(
[as_column(range(0, len(self)), dtype=size_type_dtype)], mask
)[0]
Expand Down Expand Up @@ -914,7 +914,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase:
# self.isin(other) asks "which values of self are in other"
# contains(haystack, needles) asks "which needles are in haystack"
# hence this argument ordering.
result = libcudf.search.contains(rhs, self)
result = rhs.contains(self)
if self.null_count > 0:
# If one of the needles is null, then the result contains
# nulls, these nulls should be replaced by whether or not the
Expand Down Expand Up @@ -956,6 +956,23 @@ def is_monotonic_decreasing(self) -> bool:
[self], [False], None
)

def contains(self, other: ColumnBase) -> ColumnBase:
    """
    Check whether column contains multiple values.

    Parameters
    ----------
    other : Column
        A column of values to search for

    Returns
    -------
    Column
        The output of ``plc.search.contains`` called with this column
        first and ``other`` second, wrapped via ``Column.from_pylibcudf``.
    """
    # Hold the spill lock for the whole read so both columns stay
    # accessible while pylibcudf works on them.
    with acquire_spill_lock():
        haystack = self.to_pylibcudf(mode="read")
        needles = other.to_pylibcudf(mode="read")
        found = plc.search.contains(haystack, needles)
        return Column.from_pylibcudf(found)

def sort_values(
self: Self,
ascending: bool = True,
Expand Down Expand Up @@ -1190,7 +1207,7 @@ def searchsorted(
raise ValueError(
"Column searchsorted expects values to be column of same dtype"
)
return libcudf.search.search_sorted(
return cudf.core._internals.search.search_sorted( # type: ignore[return-value]
[self],
[value],
side=side,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@

import cudf
from cudf import _lib as libcudf
from cudf._lib.search import search_sorted
from cudf.core._compat import PANDAS_GE_220
from cudf.core._internals import unary
from cudf.core._internals.search import search_sorted
from cudf.core._internals.timezones import (
check_ambiguous_and_nonexistent,
get_compatible_timezone,
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,8 @@ def __contains__(self, item: ScalarLike) -> bool:
except (TypeError, ValueError):
return False
# TODO: Use `scalar`-based `contains` wrapper
return libcudf.search.contains(
self, column.as_column([search_item], dtype=self.dtype)
return self.contains(
column.as_column([search_item], dtype=self.dtype)
).any()

def indices_of(self, value: ScalarLike) -> NumericalColumn:
Expand Down
9 changes: 3 additions & 6 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5858,13 +5858,10 @@ def sum(

def __contains__(self, item: ScalarLike) -> bool:
if is_scalar(item):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably easier to write this as `other = [item] if is_scalar(item) else item`.

return True in libcudf.search.contains(
self, column.as_column([item], dtype=self.dtype)
)
other = [item]
else:
return True in libcudf.search.contains(
self, column.as_column(item, dtype=self.dtype)
)
other = item
return self.contains(column.as_column(other, dtype=self.dtype)).any()

def as_numerical_column(
self, dtype: Dtype
Expand Down
8 changes: 5 additions & 3 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
from collections import abc
from typing import TYPE_CHECKING, Any, Literal

# TODO: The `numpy` import is needed for typing purposes during doc builds
# only, need to figure out why the `np` alias is insufficient then remove.
import cupy
import numpy
import numpy as np
Expand All @@ -19,9 +17,13 @@
import pylibcudf as plc

import cudf

# TODO: The `numpy` import is needed for typing purposes during doc builds
# only, need to figure out why the `np` alias is insufficient then remove.
from cudf import _lib as libcudf
from cudf.api.types import is_dtype_equal, is_scalar
from cudf.core._compat import PANDAS_LT_300
from cudf.core._internals.search import search_sorted
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column import (
ColumnBase,
Expand Down Expand Up @@ -1302,7 +1304,7 @@ def searchsorted(
for val, common_dtype in zip(values, common_dtype_list)
]

outcol = libcudf.search.search_sorted(
outcol = search_sorted(
sources,
values,
side,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import cudf
from cudf import _lib as libcudf
from cudf._lib.filling import sequence
from cudf._lib.search import search_sorted
from cudf._lib.types import size_type_dtype
from cudf.api.extensions import no_default
from cudf.api.types import (
Expand All @@ -32,6 +31,7 @@
)
from cudf.core._base_index import BaseIndex, _return_get_indexer_result
from cudf.core._compat import PANDAS_LT_300
from cudf.core._internals.search import search_sorted
from cudf.core.column import (
CategoricalColumn,
ColumnBase,
Expand Down
Loading