Add type stubs for pylibcudf #17258

Merged Nov 12, 2024 (23 commits)

Commits
41c74c0  Fix pylibcudf isort sections (wence-, Nov 6, 2024)
2bea16a  Missing pxd signature for apply_boolean_mask (wence-, Nov 6, 2024)
2522e84  Import quote style as QuoteStyle (wence-, Nov 6, 2024)
6232dbf  Type stubs for pylibcudf (wence-, Nov 6, 2024)
af3dc64  Minor adaptations in response to type annotations in pylibcudf (wence-, Nov 6, 2024)
5eb87fb  Use typed enum for inclusive parameter in label_bins (wence-, Nov 6, 2024)
4ef56b8  Used typed enums for null/nan equality in list methods (wence-, Nov 6, 2024)
bd9f6f8  Add some guidance about type stubs (wence-, Nov 6, 2024)
1801379  Add __all__ to all pylibcudf modules (wence-, Nov 7, 2024)
7493605  Pylibcudf classes are typically not hashable (wence-, Nov 8, 2024)
a4e8617  Minor fixes in response to review (wence-, Nov 8, 2024)
635da9e  pylibcudf: enable flake8-tidy/type-checking rules (wence-, Nov 8, 2024)
1b68bfe  Catch some missing bits (wence-, Nov 8, 2024)
42be70b  Oops (wence-, Nov 8, 2024)
9f3f5fb  No need for __init__.pyi (wence-, Nov 8, 2024)
1dcf8ec  Minor fixes from work in progress checker (wence-, Nov 8, 2024)
10d632a  Merge branch 'branch-24.12' into wence/fea/pylibcudf-typing (vyasr, Nov 8, 2024)
6badafe  Merge branch 'branch-24.12' into wence/fea/pylibcudf-typing (wence-, Nov 11, 2024)
042b184  autodoc to document most pylibcudf enums correctly (wence-, Nov 11, 2024)
3aec31f  Typing for new features (wence-, Nov 11, 2024)
b0af63d  Missing cimport (wence-, Nov 11, 2024)
8607f94  Update tests and turn on ruff pytest lints in pylibcudf (wence-, Nov 12, 2024)
3ccfc50  ColumnMetadata fields have defaults (wence-, Nov 12, 2024)
73 changes: 66 additions & 7 deletions docs/cudf/source/conf.py
@@ -26,16 +26,18 @@
import tempfile
import warnings
import xml.etree.ElementTree as ET
from enum import IntEnum
from typing import Any

import cudf
from docutils.nodes import Text
from packaging.version import Version
from sphinx.addnodes import pending_xref
from sphinx.highlighting import lexers
from sphinx.ext import intersphinx
from pygments.lexer import RegexLexer
from pygments.token import Text as PText

import cudf
from sphinx.addnodes import pending_xref
from sphinx.ext import intersphinx
from sphinx.ext.autodoc import ClassDocumenter, bool_option
from sphinx.highlighting import lexers


class PseudoLexer(RegexLexer):
@@ -342,7 +344,10 @@ def clean_all_xml_files(path):
"cudf.Series": ("cudf.core.series.Series", "cudf.Series"),
"cudf.Index": ("cudf.core.index.Index", "cudf.Index"),
"cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"),
"DeviceBuffer": ("rmm.pylibrmm.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"),
"DeviceBuffer": (
"rmm.pylibrmm.device_buffer.DeviceBuffer",
"rmm.DeviceBuffer",
),
}


@@ -373,7 +378,14 @@ def _generate_namespaces(namespaces):
_all_namespaces = _generate_namespaces(
{
# Note that io::datasource is actually a nested class
"cudf": {"io", "io::datasource", "strings", "ast", "ast::expression", "io::text"},
"cudf": {
"io",
"io::datasource",
"strings",
"ast",
"ast::expression",
"io::text",
},
"numeric": {},
"nvtext": {},
}
@@ -642,13 +654,60 @@ def linkcode_resolve(domain, info) -> str | None:
f"branch-{version}/python/cudf/cudf/{fn}{linespec}"
)


# Needed for avoid build warning for PandasCompat extension
suppress_warnings = ["myst.domains"]


class PLCIntEnumDocumenter(ClassDocumenter):
    objtype = "enum"
    directivetype = "attribute"
    priority = 10 + ClassDocumenter.priority

    option_spec = dict(ClassDocumenter.option_spec)

    @classmethod
    def can_document_member(
        cls, member: Any, membername: str, isattr: bool, parent: Any
    ) -> bool:
        try:
            return issubclass(
                member, IntEnum
            ) and member.__module__.startswith("pylibcudf")
        except TypeError:
            return False

    def add_directive_header(self, sig: str) -> None:
        self.directivetype = "attribute"
        super().add_directive_header(sig)

    def add_content(self, more_content) -> None:
        doc_as_attr = self.doc_as_attr
        self.doc_as_attr = False
        super().add_content(more_content)
        self.doc_as_attr = doc_as_attr
        source_name = self.get_sourcename()
        enum_object: IntEnum = self.object

        if self.object.__name__ != "Kind":
            self.add_line(
                f"See also :cpp:enum:`cudf::{self.object.__name__}`.",
                source_name,
            )
        self.add_line("", source_name)
        self.add_line("Enum members", source_name)
        self.add_line("", source_name)

        for the_member_name in enum_object.__members__:  # type: ignore[attr-defined]
            self.add_line(
                f"* ``{the_member_name}``", source_name
            )
        self.add_line("", source_name)


def setup(app):
    app.add_css_file("https://docs.rapids.ai/assets/css/custom.css")
    app.add_js_file(
        "https://docs.rapids.ai/assets/js/custom.js", loading_method="defer"
    )
    app.connect("doctree-read", resolve_aliases)
    app.connect("missing-reference", on_missing_reference)
    app.setup_extension("sphinx.ext.autodoc")
    app.add_autodocumenter(PLCIntEnumDocumenter)
73 changes: 72 additions & 1 deletion docs/cudf/source/developer_guide/pylibcudf.md
@@ -15,7 +15,8 @@ To satisfy the goals of pylibcudf, we impose the following set of design principles:
- All typing in code should be written using Cython syntax, not PEP 484 Python typing syntax. Not only does this ensure compatibility with Cython < 3, but even with Cython 3 PEP 484 support remains incomplete as of this writing.
- All cudf code should interact only with pylibcudf, never with libcudf directly. This is not currently the case, but is the direction that the library is moving towards.
- Ideally, pylibcudf should depend on no RAPIDS component other than rmm, and should in general have minimal runtime dependencies.

- Type stubs are provided and generated manually. When adding new
functionality, ensure that the matching type stub is appropriately updated.

## Relationship to libcudf

@@ -249,3 +250,73 @@ In the event that libcudf provides multiple overloads for the same function with
and set arguments not shared between overloads to `None`. If a user tries to pass in an unsupported argument for a specific overload type, you should raise `ValueError`.

Finally, consider opening a libcudf issue if you think this inconsistency can be addressed on the libcudf side.
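
As a hedged illustration of this pattern (the function and parameter names below are hypothetical, not an actual pylibcudf API), the combined Python-facing signature defaults the overload-specific arguments to `None` and rejects unsupported combinations:

```python
# Hypothetical combined wrapper over two libcudf overloads (one taking a
# scalar argument, one taking a column argument); names are illustrative only.
from pylibcudf import Column, Scalar

def fill_values(
    input: Column,
    scalar_value: Scalar | None = None,
    column_value: Column | None = None,
) -> Column:
    # Raise if both or neither overload-specific argument was supplied.
    if (scalar_value is None) == (column_value is None):
        raise ValueError("Specify exactly one of scalar_value or column_value")
    ...  # dispatch to the matching libcudf overload
```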

### Type stubs

Since static type checkers like `mypy` and `pyright` cannot parse
Cython code, we provide type stubs for the pylibcudf package. These
are currently maintained manually, alongside the matching pylibcudf
files.

Every `pyx` file should have a matching `pyi` file that provides the
type stubs. Most functions can be exposed straightforwardly. Some
guiding principles:

- For typed integer arguments in libcudf (for example `size_type`), use
  `int` as the type annotation (see the sketch after this list).
- For arguments annotated as a bare `list` in Cython where the function
  body performs more detailed checking, try to encode that detail in the
  stub's type.
- For Cython fused types there are two options:
1. If the fused type appears only once in the function signature,
use a `Union` type;
2. If the fused type appears more than once (or as both an input
and output type), use a `TypeVar` with
the variants in the fused type provided as constraints.
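
To illustrate the first two principles, here is a minimal sketch of stub entries; `slice_rows` and `split_at` are hypothetical names used purely for illustration, not actual pylibcudf functions:

```python
# Hypothetical entries in a .pyi stub; function names are illustrative only,
# and the import is shown just to make the sketch self-contained.
from pylibcudf import Column

# A libcudf size_type argument in the Cython signature is annotated as a
# plain Python int in the stub.
def slice_rows(input: Column, start: int, stop: int) -> Column: ...

# An argument typed as a bare `list` in Cython, but validated in the function
# body to contain integer offsets, is narrowed to list[int] in the stub.
def split_at(input: Column, offsets: list[int]) -> list[Column]: ...
```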


As an example, `pylibcudf.copying.split` is typed in Cython as:

```cython
ctypedef fused ColumnOrTable:
    Table
    Column

cpdef list split(ColumnOrTable input, list splits): ...
```

Here the fused type appears only once in the signature, and the bare
`list` annotations say nothing about their element types. The output
nevertheless tracks the input: providing a `Column` yields a
`list[Column]`, and providing a `Table` yields a `list[Table]`.

In the type stub we can encode this relationship with a `TypeVar`, and we
can also type the `splits` argument to indicate that the split points must
be integers:

```python
ColumnOrTable = TypeVar("ColumnOrTable", Column, Table)

def split(input: ColumnOrTable, splits: list[int]) -> list[ColumnOrTable]: ...
```
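
With this stub in place, a static checker resolves the constrained `TypeVar` per call site. A small hedged sketch (the helper function and variable names are illustrative):

```python
# Hedged sketch of call-site inference against the split stub above.
import pylibcudf as plc

def split_both(col: plc.Column, tbl: plc.Table) -> None:
    col_pieces = plc.copying.split(col, [2, 4])  # checker infers list[Column]
    tbl_pieces = plc.copying.split(tbl, [2, 4])  # checker infers list[Table]
```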

Conversely, `pylibcudf.copying.scatter` uses its fused type only for the
input; the output type is always `Table`:

```cython
ctypedef fused TableOrListOfScalars:
    Table
    list

cpdef Table scatter(
    TableOrListOfScalars source, Column scatter_map, Table target
)
```

In the type stub, we can use a normal union in this case:

```python
def scatter(
    source: Table | list[Scalar], scatter_map: Column, target: Table
) -> Table: ...
```
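
Because the fused type appears only in the `source` argument and the result is always a `Table`, no `TypeVar` is needed; both call shapes check against the same stub. A hedged sketch (names are illustrative):

```python
# Hedged sketch; both source variants satisfy the Table | list[Scalar] union.
import pylibcudf as plc

def scatter_twice(
    rows: plc.Table,
    fill: list[plc.Scalar],
    scatter_map: plc.Column,
    target: plc.Table,
) -> plc.Table:
    out = plc.copying.scatter(rows, scatter_map, target)  # Table source
    return plc.copying.scatter(fill, scatter_map, out)    # list[Scalar] source
```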
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/labeling.pyx
@@ -17,8 +17,8 @@ def label_bins(Column input, Column left_edges, cbool left_inclusive,
plc_column = plc.labeling.label_bins(
input.to_pylibcudf(mode="read"),
left_edges.to_pylibcudf(mode="read"),
left_inclusive,
plc.labeling.Inclusive.YES if left_inclusive else plc.labeling.Inclusive.NO,
right_edges.to_pylibcudf(mode="read"),
right_inclusive
plc.labeling.Inclusive.YES if right_inclusive else plc.labeling.Inclusive.NO,
)
return Column.from_pylibcudf(plc_column)
24 changes: 12 additions & 12 deletions python/cudf/cudf/_lib/lists.pyx
@@ -4,7 +4,9 @@ from cudf.core.buffer import acquire_spill_lock

from libcpp cimport bool

from pylibcudf.libcudf.types cimport size_type
from pylibcudf.libcudf.types cimport (
nan_equality, null_equality, null_order, order, size_type
)

from cudf._lib.column cimport Column
from cudf._lib.utils cimport columns_from_pylibcudf_table
@@ -37,8 +39,8 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal):
return Column.from_pylibcudf(
plc.lists.distinct(
col.to_pylibcudf(mode="read"),
nulls_equal,
nans_all_equal,
null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL,
nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL,
)
)

@@ -48,12 +50,8 @@ def sort_lists(Column col, bool ascending, str na_position):
return Column.from_pylibcudf(
plc.lists.sort_lists(
col.to_pylibcudf(mode="read"),
ascending,
(
plc.types.NullOrder.BEFORE
if na_position == "first"
else plc.types.NullOrder.AFTER
),
order.ASCENDING if ascending else order.DESCENDING,
null_order.BEFORE if na_position == "first" else null_order.AFTER,
False,
)
)
@@ -95,7 +93,7 @@ def index_of_scalar(Column col, object py_search_key):
plc.lists.index_of(
col.to_pylibcudf(mode="read"),
<Scalar> py_search_key.device_value.c_value,
True,
plc.lists.DuplicateFindOption.FIND_FIRST,
)
)

@@ -106,7 +104,7 @@ def index_of_column(Column col, Column search_keys):
plc.lists.index_of(
col.to_pylibcudf(mode="read"),
search_keys.to_pylibcudf(mode="read"),
True,
plc.lists.DuplicateFindOption.FIND_FIRST,
)
)

@@ -127,7 +125,9 @@ def concatenate_list_elements(Column input_column, dropna=False):
return Column.from_pylibcudf(
plc.lists.concatenate_list_elements(
input_column.to_pylibcudf(mode="read"),
dropna,
plc.lists.ConcatenateNullPolicy.IGNORE
if dropna
else plc.lists.ConcatenateNullPolicy.NULLIFY_OUTPUT_ROW,
)
)

2 changes: 1 addition & 1 deletion python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -60,7 +60,7 @@ def to_polars(self) -> pl.DataFrame:
# To guarantee we produce correct names, we therefore
# serialise with names we control and rename with that map.
name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)}
table: pa.Table = plc.interop.to_arrow(
table = plc.interop.to_arrow(
self.table,
[plc.interop.ColumnMetadata(name=name) for name in name_map],
)
@@ -27,7 +27,9 @@

class TemporalFunction(Expr):
__slots__ = ("name", "options")
_COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = {
_COMPONENT_MAP: ClassVar[
dict[pl_expr.TemporalFunction, plc.datetime.DatetimeComponent]
] = {
pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR,
pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH,
pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY,
@@ -58,7 +58,7 @@ def collect_agg(self, *, depth: int) -> AggInfo:
class LiteralColumn(Expr):
__slots__ = ("value",)
_non_child = ("dtype", "value")
value: pa.Array[Any, Any]
value: pa.Array[Any]

def __init__(self, dtype: plc.DataType, value: pl.Series) -> None:
self.dtype = dtype
2 changes: 1 addition & 1 deletion python/cudf_polars/cudf_polars/dsl/ir.py
@@ -500,7 +500,7 @@ def do_evaluate(
# Mask must have been applied.
return df
elif typ == "ndjson":
json_schema: list[tuple[str, str, list]] = [
json_schema: list[plc.io.json.NameAndType] = [
(name, typ, []) for name, typ in schema.items()
]
plc_tbl_w_meta = plc.io.json.read_json(