Skip to content

Commit

Permalink
add unit tests and modify signatures
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke committed Sep 10, 2024
1 parent a3a0f65 commit 66fad1f
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 126 deletions.
11 changes: 6 additions & 5 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,6 @@ cdef class Column:
children=tuple(children)
)

# TODO: Actually support exposed data pointers.
@staticmethod
def from_pylibcudf(
col, bint data_ptr_exposed=False
Expand All @@ -616,7 +615,7 @@ cdef class Column:
col : pylibcudf.Column
The object to copy.
data_ptr_exposed : bool
This parameter is not yet supported
Whether the data buffer is exposed.
Returns
-------
Expand All @@ -639,16 +638,18 @@ cdef class Column:
dtype = dtype_from_pylibcudf_column(col)

return cudf.core.column.build_column(
data=as_buffer(col.data().obj) if col.data() is not None else None,
data=as_buffer(
col.data().obj, exposed=data_ptr_exposed
) if col.data() is not None else None,
dtype=dtype,
size=col.size(),
mask=as_buffer(
col.null_mask().obj
col.null_mask().obj, exposed=data_ptr_exposed
) if col.null_mask() is not None else None,
offset=col.offset(),
null_count=col.null_count(),
children=tuple([
Column.from_pylibcudf(child)
Column.from_pylibcudf(child, data_ptr_exposed=data_ptr_exposed)
for child in col.children()
])
)
Expand Down
109 changes: 28 additions & 81 deletions python/cudf/cudf/_lib/transform.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,10 @@ def bools_to_mask(Column col):
Given an int8 (boolean) column, compress the data from booleans to bits and
return a Buffer
"""
cdef column_view col_view = col.view()
cdef pair[unique_ptr[device_buffer], size_type] cpp_out
cdef unique_ptr[device_buffer] up_db

with nogil:
cpp_out = move(libcudf_transform.bools_to_mask(col_view))
up_db = move(cpp_out.first)

rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db))
buf = as_buffer(rmm_db)
return buf
mask, _ = plc_transform.bools_to_mask(
input.to_pylibcudf(mode="read")
)
return as_buffer(mask)


@acquire_spill_lock()
Expand All @@ -68,103 +61,57 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit):
if not isinstance(mask_buffer, cudf.core.buffer.Buffer):
raise TypeError("mask_buffer is not an instance of "
"cudf.core.buffer.Buffer")
cdef bitmask_type* bit_mask = <bitmask_type*><uintptr_t>(
mask_buffer.get_ptr(mode="read")
plc_columns = plc_transform.mask_to_bools(
mask_buffer.get_ptr(mode="read"), begin_bit, end_bit
)

cdef unique_ptr[column] result
with nogil:
result = move(
libcudf_transform.mask_to_bools(bit_mask, begin_bit, end_bit)
)

return Column.from_unique_ptr(move(result))
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
def nans_to_nulls(Column input):
    """
    Build a mask buffer for ``input`` via ``plc_transform.nans_to_nulls``.

    Parameters
    ----------
    input : Column
        Column handed to pylibcudf in read mode.

    Returns
    -------
    Buffer
        The first element of the pylibcudf result, wrapped with ``as_buffer``.
    """
    # The second element of the result is discarded here.
    # NOTE(review): presumably it is the null count -- confirm against
    # pylibcudf.transform.nans_to_nulls.
    mask, _ = plc_transform.nans_to_nulls(
        input.to_pylibcudf(mode="read")
    )
    return as_buffer(mask)


@acquire_spill_lock()
def transform(Column input, op):
    """
    Compile ``op`` as a unary UDF and apply it to ``input``.

    Parameters
    ----------
    input : Column
        Column the compiled UDF is applied to.
    op : callable
        Function compiled with ``cudautils.compile_udf`` for the numba type
        that corresponds to ``input.dtype``.

    Returns
    -------
    Column
        The column produced by ``plc_transform.transform``.
    """
    nb_type = numpy_support.from_dtype(input.dtype)
    nb_signature = (nb_type,)
    # compiled_op[0] is the compiled UDF source handed to pylibcudf;
    # compiled_op[1] describes the UDF's result dtype.
    compiled_op = cudautils.compile_udf(op, nb_signature)
    np_dtype = cudf.dtype(compiled_op[1])

    plc_column = plc_transform.transform(
        input.to_pylibcudf(mode="read"),
        compiled_op[0],
        # Drop the leading byte-order character of the dtype string
        # before looking up the pylibcudf DataType.
        plc.column._datatype_from_dtype_desc(np_dtype.str[1:]),
        # Passed as pylibcudf's ``is_ptx`` argument.
        True
    )
    return Column.from_pylibcudf(plc_column)


def table_encode(list source_columns):
    """
    Encode the rows of ``source_columns`` with ``plc_transform.encode``.

    Parameters
    ----------
    source_columns : list of Column
        Columns that together form the table to encode.

    Returns
    -------
    tuple[list[Column], Column]
        The columns of the table returned by pylibcudf, followed by the
        column returned alongside it.
        NOTE(review): presumably the distinct key rows and the per-row
        integer codes -- confirm against pylibcudf.transform.encode.
    """
    # ``encode`` takes a single Table and returns a (Table, Column) pair;
    # ``transform`` is the unary-UDF entry point and does not fit here.
    plc_table, plc_column = plc_transform.encode(
        plc.Table([col.to_pylibcudf(mode="read") for col in source_columns])
    )

    return (
        [Column.from_pylibcudf(col) for col in plc_table.columns()],
        Column.from_pylibcudf(plc_column),
    )


def one_hot_encode(Column input_column, Column categories):
    """
    One-hot encode ``input_column`` against the values in ``categories``.

    Parameters
    ----------
    input_column : Column
        Column whose values are encoded.
    categories : Column
        Column of category values; its entries label the output columns.

    Returns
    -------
    dict
        Mapping from category label to the encoded Column for that
        category. A null category is labelled ``'<NA>'``.
    """
    plc_table = plc_transform.one_hot_encode(
        input_column.to_pylibcudf(mode="read"),
        categories.to_pylibcudf(mode="read"),
    )
    # NOTE(review): the pylibcudf result appears to be a set of views over a
    # single owning column, so every wrapped buffer is marked as exposed --
    # confirm against pylibcudf.transform.one_hot_encode.
    result_columns = [
        Column.from_pylibcudf(col, data_ptr_exposed=True)
        for col in plc_table.columns()
    ]
    # Keep the pre-existing labelling contract: null categories are
    # reported under the '<NA>' key rather than None.
    result_labels = [
        x if x is not None else '<NA>'
        for x in categories.to_arrow().to_pylist()
    ]
    # Return only the mapping, as callers of the previous implementation
    # received, rather than a (mapping, None) pair.
    return dict(zip(result_labels, result_columns))


@acquire_spill_lock()
Expand Down
51 changes: 51 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,54 @@ def test_nans_to_nulls(has_nans):
got = input.with_mask(mask, null_count)

assert_column_eq(expect, got)


def test_bools_to_mask_roundtrip():
    """Round-trip a boolean column through bools_to_mask and mask_to_bools."""
    source = pa.array([True, None, False])
    column = plc.interop.from_arrow(source)

    # Forward: booleans -> bitmask plus the number of nulls it encodes.
    bitmask, null_count = plc.transform.bools_to_mask(column)
    assert null_count == 2
    masked = column.with_mask(bitmask, null_count)
    assert_column_eq(pa.array([True, None, None]), masked)

    # Backward: expand bits [0, len(source)) of the bitmask into booleans.
    roundtripped = plc.interop.to_arrow(
        plc.transform.mask_to_bools(bitmask.ptr, 0, len(source))
    )
    assert roundtripped.equals(pa.chunked_array([[True, False, False]]))


def test_encode():
    """Check the table and column returned by plc.transform.encode."""
    source = plc.interop.from_arrow(
        pa.table({"a": [1, 3, 4], "b": [1, 2, 4]})
    )
    keys, codes = plc.transform.encode(source)
    keys_pa = plc.interop.to_arrow(keys)
    codes_pa = plc.interop.to_arrow(codes)

    # The round trip through pylibcudf yields unnamed, non-nullable fields.
    unnamed_int64 = pa.field("", pa.int64(), nullable=False)
    expected_keys = pa.table(
        [[1, 3, 4], [1, 2, 4]],
        schema=pa.schema([unnamed_int64, unnamed_int64]),
    )
    assert keys_pa.equals(expected_keys)

    expected_codes = pa.chunked_array([[0, 1, 2]], type=pa.int32())
    assert codes_pa.equals(expected_codes)


def test_one_hot_encode():
    """Check one_hot_encode when no input value matches any category."""
    values = plc.interop.from_arrow(pa.array([1, 2, 3]))
    categories = plc.interop.from_arrow(pa.array([0, 0, 0]))

    encoded = plc.transform.one_hot_encode(values, categories)
    got = plc.interop.to_arrow(encoded)

    # Three categories x three rows, all False since 0 never occurs in the
    # input; fields come back unnamed and non-nullable.
    n = 3
    all_false = [[False] * n] * n
    bool_field = pa.field("", pa.bool_(), nullable=False)
    want = pa.table(all_false, schema=pa.schema([bool_field] * n))
    assert got.equals(want)
6 changes: 2 additions & 4 deletions python/pylibcudf/pylibcudf/transform.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp cimport bool
from pylibcudf.libcudf.types cimport bitmask_type, data_type

from .column cimport Column
Expand All @@ -18,6 +18,4 @@ cpdef Column transform(Column input, str unary_udf, DataType output_type, bool i

cpdef tuple[Table, Column] encode(Table input)

cpdef tuple[Column, Table] one_hot_encode(Column input_column, Column categories)

cpdef Column compute_column(Table table, str expr)
cpdef Table one_hot_encode(Column input_column, Column categories)
49 changes: 13 additions & 36 deletions python/pylibcudf/pylibcudf/transform.pyx
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move, pair
from pylibcudf.libcudf cimport transform as cpp_transform
from pylibcudf.libcudf.column.column cimport column
# from pylibcudf.libcudf.expressions cimport expression
from pylibcudf.libcudf.table.table cimport table
from pylibcudf.libcudf.table.table_view cimport table_view
from pylibcudf.libcudf.types cimport bitmask_type, size_type
Expand Down Expand Up @@ -112,11 +112,13 @@ cpdef Column transform(Column input, str unary_udf, DataType output_type, bool i
The transformed column having the UDF applied to each element.
"""
cdef unique_ptr[column] c_result
cdef string c_unary_udf = unary_udf.encode()
cdef bool c_is_ptx = is_ptx

with nogil:
c_result = move(
cpp_transform.transform(
input.view(), unary_udf, output_type.c_obj, is_ptx
input.view(), c_unary_udf, output_type.c_obj, c_is_ptx
)
)

Expand All @@ -139,14 +141,14 @@ cpdef tuple[Table, Column] encode(Table input):
cdef pair[unique_ptr[table], unique_ptr[column]] c_result

with nogil:
c_result = move(cpp_transform.encode(table.view()))
c_result = move(cpp_transform.encode(input.view()))

return (
Table.from_libcudf(move(c_result.first)),
Column.from_libcudf(move(c_result.second))
)

cpdef tuple[Column, Table] one_hot_encode(Column input, Column categories):
cpdef Table one_hot_encode(Column input, Column categories):
"""Encodes `input` by generating a new column
for each value in `categories` indicating the presence
of that value in `input`.
Expand All @@ -160,42 +162,17 @@ cpdef tuple[Column, Table] one_hot_encode(Column input, Column categories):
Returns
-------
tuple[Column, Table]
A two-tuple containing the owner column to all the encoded data
and a table of the encoded values.
Column
A table of the encoded values.
"""
cdef pair[unique_ptr[column], table_view] c_result
cdef Table owner_table

with nogil:
c_result = move(cpp_transpose.one_hot_encode(input.view(), categories.view()))

owner_column = Column.from_libcudf(move(c_result.first))
owner_table = Table([owner_column] * c_result.second.num_columns())
c_result = move(cpp_transform.one_hot_encode(input.view(), categories.view()))

return (
owner_column,
Table.from_table_view(c_result.second, owner_table)
owner_table = Table(
[Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns()
)

cpdef Column compute_column(Table input_table, str expr):
"""Compute a new column by evaluating an expression tree on a table.
Parameters
----------
input_table : Table
The table used for expression evaluation.
expr : Column
The root of the expression tree.
Returns
-------
Column
Resulting column.
"""
pass
# cdef unique_ptr[column] c_result

# with nogil:
# c_result = move(cpp_transform.compute_column(input_table.view(), expr))

# return Column.from_libcudf(move(c_result))
return Table.from_table_view(c_result.second, owner_table)

0 comments on commit 66fad1f

Please sign in to comment.