Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pylibcudf: pack and unpack #17012

Merged
merged 19 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ set(cython_sources
binaryop.pyx
column.pyx
column_factories.pyx
contiguous_split.pyx
concatenate.pyx
copying.pyx
datetime.pyx
Expand Down
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ from . cimport (
binaryop,
column_factories,
concatenate,
contiguous_split,
copying,
datetime,
experimental,
Expand Down Expand Up @@ -51,6 +52,7 @@ __all__ = [
"aggregation",
"binaryop",
"column_factories",
"contiguous_split",
"concatenate",
"copying",
"datetime",
Expand Down
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
binaryop,
column_factories,
concatenate,
contiguous_split,
copying,
datetime,
experimental,
Expand Down Expand Up @@ -62,6 +63,7 @@
"aggregation",
"binaryop",
"column_factories",
"contiguous_split",
"concatenate",
"copying",
"datetime",
Expand Down
20 changes: 20 additions & 0 deletions python/pylibcudf/pylibcudf/contiguous_split.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from pylibcudf.libcudf.contiguous_split cimport packed_columns

from .gpumemoryview cimport gpumemoryview
from .table cimport Table


# Declaration of the PackedColumns extension type, which holds a libcudf
# ``packed_columns`` object behind a ``unique_ptr``.
cdef class PackedColumns:
# Pointer to the wrapped libcudf ``packed_columns`` instance.
cdef unique_ptr[packed_columns] c_obj

# C-level factory: takes a ``unique_ptr[packed_columns]`` and returns a
# PackedColumns.
@staticmethod
cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data)

# Takes a Table and returns a PackedColumns.
cpdef PackedColumns pack(Table input)

# Takes a PackedColumns and returns a Table.
cpdef Table unpack(PackedColumns input)

# Takes a ``memoryview`` (metadata) and a ``gpumemoryview`` (gpu_data) and
# returns a Table.
cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data)
164 changes: 164 additions & 0 deletions python/pylibcudf/pylibcudf/contiguous_split.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libc.stdint cimport uint8_t, uintptr_t
from libcpp.memory cimport make_unique, unique_ptr
from libcpp.utility cimport move
from pylibcudf.libcudf.contiguous_split cimport (
pack as cpp_pack,
packed_columns,
unpack as cpp_unpack,
)
from pylibcudf.libcudf.table.table cimport table
from pylibcudf.libcudf.table.table_view cimport table_view

from .gpumemoryview cimport gpumemoryview
from .table cimport Table
from .utils cimport int_to_void_ptr

from types import SimpleNamespace

import numpy as np
mroeschke marked this conversation as resolved.
Show resolved Hide resolved


cdef class PackedColumns:
    """Serialized representation of a set of columns.

    The columns are stored in two contiguous buffers: a host buffer that
    holds the table metadata and a device buffer that holds the table
    data.

    For details, see :cpp:class:`cudf::packed_columns`.
    """

    def __init__(self):
        # Instances are built through ``from_libcudf`` (which goes through
        # ``__new__``), so calling the constructor directly always fails.
        message = (
            "PackedColumns should not be constructed directly. "
            "Use one of the factories."
        )
        raise ValueError(message)

    @staticmethod
    cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data):
        """Wrap a libcudf packed_columns in a Python PackedColumns."""
        cdef PackedColumns wrapper = PackedColumns.__new__(PackedColumns)
        wrapper.c_obj = move(data)
        return wrapper

    @property
    def metadata(self):
        """memoryview of the metadata (host memory)"""
        cdef size_t nbytes = dereference(dereference(self.c_obj).metadata).size()
        cdef uint8_t* ptr = dereference(dereference(self.c_obj).metadata).data()

        # An empty metadata buffer is returned as a fresh zero-length array
        # instead of a view built from the raw pointer.
        if nbytes == 0:
            return memoryview(np.ndarray(shape=(0,), dtype="uint8"))

        # Describe the raw buffer as a 1-D array of unsigned bytes.
        interface = {
            'data': (<uintptr_t>ptr, False),
            'shape': (nbytes,),
            'typestr': '|u1',
            'strides': None,
            'version': 3,
        }
        # The holder also references ``self`` through ``owner``; presumably
        # this keeps the underlying buffer alive while the view is in use.
        holder = SimpleNamespace(owner=self, __array_interface__=interface)
        return memoryview(np.asarray(holder))

    @property
    def gpu_data(self):
        """gpumemoryview of the data (device memory)"""
        cdef size_t nbytes = dereference(dereference(self.c_obj).gpu_data).size()
        cdef void* ptr = dereference(dereference(self.c_obj).gpu_data).data()

        # Describe the raw buffer as a 1-D array of unsigned bytes.
        interface = {
            'data': (<uintptr_t>ptr, False),
            'shape': (nbytes,),
            'typestr': '|u1',
            'strides': None,
            'version': 3,
        }
        # The holder also references ``self`` through ``owner``; presumably
        # this keeps the underlying buffer alive while the view is in use.
        holder = SimpleNamespace(
            owner=self, __cuda_array_interface__=interface
        )
        return gpumemoryview(holder)


cpdef PackedColumns pack(Table input):
    """Serialize a table into a contiguous memory format (deep copy).

    For details, see :cpp:func:`cudf::pack`.

    Parameters
    ----------
    input : Table
        Table to pack.

    Returns
    -------
    PackedColumns
        The packed columns.
    """
    # Hold the libcudf result in an owning pointer, then hand that
    # pointer over to the Python wrapper.
    cdef unique_ptr[packed_columns] packed = make_unique[packed_columns](
        cpp_pack(input.view())
    )
    return PackedColumns.from_libcudf(move(packed))


cpdef Table unpack(PackedColumns input):
    """Deserialize the result of `pack` into a new table.

    Unlike the libcudf C++ function, the table returned here is a copy
    of the serialized data.

    For details, see :cpp:func:`cudf::unpack`.

    Parameters
    ----------
    input : PackedColumns
        The packed columns to unpack.

    Returns
    -------
    Table
        Copy of the packed columns.
    """
    # Constructing a ``table`` from the unpacked ``table_view`` is what
    # makes the copy.
    cdef unique_ptr[table] copied = make_unique[table](
        cpp_unpack(dereference(input.c_obj))
    )
    return Table.from_libcudf(move(copied))


cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data):
    """Deserialize the result of `pack`.

    Copies the result of a serialized table into a table.
    Contrary to the libcudf C++ function, the returned table is a copy
    of the serialized data.

    For details, see :cpp:func:`cudf::unpack`.

    Parameters
    ----------
    metadata : memoryview
        The packed metadata to unpack.
    gpu_data : gpumemoryview
        The packed gpu_data to unpack.

    Returns
    -------
    Table
        Copy of the packed columns.

    Raises
    ------
    ValueError
        If ``metadata`` is empty but ``gpu_data`` reports a non-zero
        data pointer.
    """
    # Empty metadata means an empty table: check that no device buffer
    # came with it, then return a table built from an empty table_view.
    if metadata.nbytes == 0:
        if gpu_data.__cuda_array_interface__["data"][0] != 0:
            raise ValueError(
                "expect an empty gpu_data when unpacking an empty table"
            )
        return Table.from_libcudf(make_unique[table](table_view()))

    # Extract the raw data pointers
    cdef const uint8_t[::1] _metadata = metadata
    cdef const uint8_t* metadata_ptr = &_metadata[0]
    cdef const uint8_t* gpu_data_ptr = <uint8_t*>int_to_void_ptr(gpu_data.ptr)

    cdef table_view v = cpp_unpack(metadata_ptr, gpu_data_ptr)
    # Constructing a ``table`` from the view is what makes the copy.
    cdef unique_ptr[table] t = make_unique[table](v)
    return Table.from_libcudf(move(t))
5 changes: 5 additions & 0 deletions python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,8 @@ cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" nogil:
cdef packed_columns pack (const table_view& input) except +

cdef table_view unpack (const packed_columns& input) except +

cdef table_view unpack (
const uint8_t* metadata,
const uint8_t* gpu_data
) except +
32 changes: 32 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_contiguous_split.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import cupy
import pyarrow as pa
import pylibcudf as plc
import pytest
from utils import assert_table_eq


@pytest.mark.parametrize(
    "arrow_tbl",
    [
        pa.table([]),
        pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}),
        pa.table({"a": [1, 2, 3]}),
        pa.table({"a": [1], "b": [2], "c": [3]}),
        pa.table({"a": ["a", "bb", "ccc"]}),
    ],
)
def test_pack_and_unpack(arrow_tbl):
    """Round-trip a table through pack/unpack, directly and via copied buffers."""
    split = plc.contiguous_split
    packed = split.pack(plc.interop.from_arrow(arrow_tbl))

    # Direct round trip through the PackedColumns object.
    assert_table_eq(arrow_tbl, split.unpack(packed))

    # Copy the buffers to simulate IO
    host_copy = memoryview(bytes(packed.metadata))
    device_copy = plc.gpumemoryview(cupy.array(packed.gpu_data, copy=True))

    roundtripped = split.unpack_from_memoryviews(host_copy, device_copy)
    assert_table_eq(arrow_tbl, roundtripped)
Loading