Skip to content

Commit

Permalink
Remove cudf._lib.interop in favor of inlining pylibcudf (#17555)
Browse files Browse the repository at this point in the history
Contributes to #17317

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17555
  • Loading branch information
mroeschke authored Dec 17, 2024
1 parent 267c7f2 commit b9760ac
Show file tree
Hide file tree
Showing 9 changed files with 62 additions and 165 deletions.
9 changes: 1 addition & 8 deletions python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@
# the License.
# =============================================================================

set(cython_sources column.pyx groupby.pyx interop.pyx scalar.pyx strings_udf.pyx types.pyx
utils.pyx
)
set(cython_sources column.pyx groupby.pyx scalar.pyx strings_udf.pyx types.pyx utils.pyx)
set(linked_libraries cudf::cudf)

rapids_cython_create_modules(
Expand All @@ -24,8 +22,3 @@ rapids_cython_create_modules(
)

target_link_libraries(strings_udf PUBLIC cudf_strings_udf)
target_include_directories(interop PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")

include(${rapids-cmake-dir}/export/find_package_root.cmake)
include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake)
target_link_libraries(interop PUBLIC nanoarrow)
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from . import (
groupby,
interop,
strings_udf,
)

Expand Down
111 changes: 0 additions & 111 deletions python/cudf/cudf/_lib/interop.pyx

This file was deleted.

48 changes: 29 additions & 19 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ def dropna(self) -> Self:
else:
return self.copy()

@acquire_spill_lock()
def to_arrow(self) -> pa.Array:
"""Convert to PyArrow Array
Expand All @@ -295,9 +296,7 @@ def to_arrow(self) -> pa.Array:
4
]
"""
return libcudf.interop.to_arrow([self], [("None", self.dtype)])[
"None"
].chunk(0)
return plc.interop.to_arrow(self.to_pylibcudf(mode="read")).chunk(0)

@classmethod
def from_arrow(cls, array: pa.Array) -> ColumnBase:
Expand Down Expand Up @@ -334,26 +333,33 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:

if isinstance(array.type, pa.DictionaryType):
indices_table = pa.table(
{
"None": pa.chunked_array(
[chunk.indices for chunk in data["None"].chunks],
[
pa.chunked_array(
[chunk.indices for chunk in data.column(0).chunks],
type=array.type.index_type,
)
}
],
[None],
)
dictionaries_table = pa.table(
{
"None": pa.chunked_array(
[chunk.dictionary for chunk in data["None"].chunks],
[
pa.chunked_array(
[chunk.dictionary for chunk in data.column(0).chunks],
type=array.type.value_type,
)
}
],
[None],
)

codes = libcudf.interop.from_arrow(indices_table)[0]
categories = libcudf.interop.from_arrow(dictionaries_table)[0]
with acquire_spill_lock():
codes = cls.from_pylibcudf(
plc.interop.from_arrow(indices_table).columns()[0]
)
categories = cls.from_pylibcudf(
plc.interop.from_arrow(dictionaries_table).columns()[0]
)
codes = cudf.core.column.categorical.as_unsigned_codes(
len(categories), codes
len(categories),
codes, # type: ignore[arg-type]
)
return cudf.core.column.CategoricalColumn(
data=None,
Expand All @@ -364,10 +370,14 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
mask=codes.base_mask,
children=(codes,),
)

result = libcudf.interop.from_arrow(data)[0]

return result._with_type_metadata(cudf_dtype_from_pa_type(array.type))
else:
result = cls.from_pylibcudf(
plc.interop.from_arrow(data).columns()[0]
)
# TODO: cudf_dtype_from_pa_type may be less necessary for some types
return result._with_type_metadata(
cudf_dtype_from_pa_type(array.type)
)

@acquire_spill_lock()
def _get_mask_as_column(self) -> ColumnBase:
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1016,7 +1016,7 @@ def to_pandas(
self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
)

def to_arrow(self):
def to_arrow(self) -> pa.Array:
return pa.compute.assume_timezone(
self._local_time.to_arrow(), str(self.dtype.tz)
)
Expand Down
10 changes: 5 additions & 5 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,8 +269,8 @@ def from_arrow(cls, data: pa.Array):
mask=mask,
)

def to_arrow(self):
data_buf_32 = np.array(self.base_data.memoryview()).view("int32")
def to_arrow(self) -> pa.Array:
data_buf_32 = np.array(self.base_data.memoryview()).view("int32") # type: ignore[union-attr]
data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32")

# use striding to set the first 32 bits of each 128-bit chunk:
Expand Down Expand Up @@ -337,7 +337,7 @@ def from_arrow(cls, data: pa.Array):
result.dtype.precision = data.type.precision
return result

def to_arrow(self):
def to_arrow(self) -> pa.Array:
return super().to_arrow().cast(self.dtype.to_arrow())

def _with_type_metadata(
Expand Down Expand Up @@ -396,8 +396,8 @@ def from_arrow(cls, data: pa.Array):
mask=mask,
)

def to_arrow(self):
data_buf_64 = np.array(self.base_data.memoryview()).view("int64")
def to_arrow(self) -> pa.Array:
data_buf_64 = np.array(self.base_data.memoryview()).view("int64") # type: ignore[union-attr]
data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64")

# use striding to set the first 64 bits of each 128-bit chunk:
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def offsets(self) -> NumericalColumn:
"""
return cast(NumericalColumn, self.children[0])

def to_arrow(self):
def to_arrow(self) -> pa.Array:
offsets = self.offsets.to_arrow()
elements = (
pa.nulls(len(self.elements))
Expand All @@ -160,7 +160,7 @@ def to_arrow(self):
pa_type = pa.list_(elements.type)

if self.nullable:
nbuf = pa.py_buffer(self.mask.memoryview())
nbuf = pa.py_buffer(self.mask.memoryview()) # type: ignore[union-attr]
buffers = (nbuf, offsets.buffers()[1])
else:
buffers = offsets.buffers()
Expand Down
15 changes: 8 additions & 7 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -946,16 +946,17 @@ def from_arrow(cls, data: pa.Table) -> Self:
if len(dict_indices):
dict_indices_table = pa.table(dict_indices)
data = data.drop(dict_indices_table.column_names)
indices_columns = libcudf.interop.from_arrow(dict_indices_table)
plc_indices = plc.interop.from_arrow(dict_indices_table)
# as dictionary size can vary, it can't be a single table
cudf_dictionaries_columns = {
name: ColumnBase.from_arrow(dict_dictionaries[name])
for name in dict_dictionaries.keys()
}

for name, codes in zip(
dict_indices_table.column_names, indices_columns
for name, plc_codes in zip(
dict_indices_table.column_names, plc_indices.columns()
):
codes = libcudf.column.Column.from_pylibcudf(plc_codes)
categories = cudf_dictionaries_columns[name]
codes = as_unsigned_codes(len(categories), codes)
cudf_category_frame[name] = CategoricalColumn(
Expand All @@ -971,9 +972,9 @@ def from_arrow(cls, data: pa.Table) -> Self:

# Handle non-dict arrays
cudf_non_category_frame = {
name: col
for name, col in zip(
data.column_names, libcudf.interop.from_arrow(data)
name: libcudf.column.Column.from_pylibcudf(plc_col)
for name, plc_col in zip(
data.column_names, plc.interop.from_arrow(data).columns()
)
}

Expand Down Expand Up @@ -1032,7 +1033,7 @@ def from_arrow(cls, data: pa.Table) -> Self:
return cls._from_data({name: result[name] for name in column_names})

@_performance_tracking
def to_arrow(self):
def to_arrow(self) -> pa.Table:
"""
Convert to arrow Table
Expand Down
27 changes: 16 additions & 11 deletions python/cudf/cudf/io/dlpack.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
from __future__ import annotations

import pylibcudf as plc

import cudf
from cudf._lib import interop as libdlpack
from cudf.core.column import ColumnBase
from cudf.utils import ioutils


def from_dlpack(pycapsule_obj):
def from_dlpack(pycapsule_obj) -> cudf.Series | cudf.DataFrame:
"""Converts from a DLPack tensor to a cuDF object.
DLPack is an open-source memory tensor structure:
Expand All @@ -33,18 +34,21 @@ def from_dlpack(pycapsule_obj):
cuDF from_dlpack() assumes column-major (Fortran order) input. If the input
tensor is row-major, transpose it before passing it to this function.
"""
plc_table = plc.interop.from_dlpack(pycapsule_obj)
data = dict(
enumerate(
(ColumnBase.from_pylibcudf(col) for col in plc_table.columns())
)
)

columns = libdlpack.from_dlpack(pycapsule_obj)
data = dict(enumerate(columns))

if len(columns) == 1:
if len(data) == 1:
return cudf.Series._from_data(data)
else:
return cudf.DataFrame._from_data(data)


@ioutils.doc_to_dlpack()
def to_dlpack(cudf_obj):
def to_dlpack(cudf_obj: cudf.Series | cudf.DataFrame | cudf.BaseIndex):
"""Converts a cuDF object to a DLPack tensor.
DLPack is an open-source memory tensor structure:
Expand Down Expand Up @@ -80,13 +84,14 @@ def to_dlpack(cudf_obj):

if any(
not cudf.api.types._is_non_decimal_numeric_dtype(dtype)
for _, dtype in gdf._dtypes
for _, dtype in gdf._dtypes # type: ignore[union-attr]
):
raise TypeError("non-numeric data not yet supported")

dtype = cudf.utils.dtypes.find_common_type(
[dtype for _, dtype in gdf._dtypes]
[dtype for _, dtype in gdf._dtypes] # type: ignore[union-attr]
)
gdf = gdf.astype(dtype)

return libdlpack.to_dlpack([*gdf._columns])
return plc.interop.to_dlpack(
plc.Table([col.to_pylibcudf(mode="read") for col in gdf._columns])
)

0 comments on commit b9760ac

Please sign in to comment.