From 24f1e4d06ee5cd57e2dccb8e35516cb1c5d04d1c Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 8 Oct 2024 13:46:21 +0200 Subject: [PATCH 01/16] pylibcudf: pack and unpack bindings --- python/pylibcudf/pylibcudf/CMakeLists.txt | 1 + python/pylibcudf/pylibcudf/__init__.pxd | 2 + python/pylibcudf/pylibcudf/__init__.py | 2 + .../pylibcudf/pylibcudf/contiguous_split.pxd | 17 ++++++++ .../pylibcudf/pylibcudf/contiguous_split.pyx | 40 +++++++++++++++++++ .../pylibcudf/tests/test_contiguous_split.py | 23 +++++++++++ 6 files changed, 85 insertions(+) create mode 100644 python/pylibcudf/pylibcudf/contiguous_split.pxd create mode 100644 python/pylibcudf/pylibcudf/contiguous_split.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_contiguous_split.py diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index 1d72eacac12..49e0b7ad647 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -17,6 +17,7 @@ set(cython_sources binaryop.pyx column.pyx column_factories.pyx + contiguous_split.pyx concatenate.pyx copying.pyx datetime.pyx diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index b98b37fe0fd..dedfa95ecfe 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -6,6 +6,7 @@ from . cimport ( binaryop, column_factories, concatenate, + contiguous_split, copying, datetime, experimental, @@ -51,6 +52,7 @@ __all__ = [ "aggregation", "binaryop", "column_factories", + "contiguous_split", "concatenate", "copying", "datetime", diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 304f27be340..c9c386889b9 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -15,6 +15,7 @@ binaryop, column_factories, concatenate, + contiguous_split, copying, datetime, experimental, @@ -62,6 +63,7 @@ "aggregation", "binaryop", "column_factories", + "contiguous_split", "concatenate", "copying", "datetime", diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/contiguous_split.pxd new file mode 100644 index 00000000000..27453126ca8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/contiguous_split.pxd @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from pylibcudf.libcudf.contiguous_split cimport packed_columns + +from .table cimport Table + + +cdef class PackedColumns: + cdef unique_ptr[packed_columns] c_obj + + @staticmethod + cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data) + +cpdef PackedColumns pack(Table input) + +cpdef Table unpack(PackedColumns input) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx new file mode 100644 index 00000000000..78d5cbc6016 --- /dev/null +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -0,0 +1,40 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp.memory cimport make_unique, unique_ptr +from libcpp.utility cimport move +from pylibcudf.libcudf.contiguous_split cimport ( + pack as cpp_pack, + packed_columns, + unpack as cpp_unpack, +) +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view + +from .table cimport Table + + +cdef class PackedColumns: + def __init__(self): + raise ValueError( + "PackedColumns should not be constructed directly. " + "Use one of the factories." + ) + + @staticmethod + cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data): + """Create a Python PackedColumns from a libcudf packed_columns.""" + cdef PackedColumns out = PackedColumns.__new__(PackedColumns) + out.c_obj = move(data) + return out + +cpdef PackedColumns pack(Table input): + return PackedColumns.from_libcudf( + make_unique[packed_columns](cpp_pack(input.view())) + ) + + +cpdef Table unpack(PackedColumns input): + cdef table_view v = cpp_unpack(dereference(input.c_obj)) + cdef unique_ptr[table] t = make_unique[table](v) + return Table.from_libcudf(move(t)) diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py new file mode 100644 index 00000000000..f5746dea640 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_table_eq + + +@pytest.mark.parametrize( + "arrow_tbl", + [ + pa.table([]), + pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}), + pa.table({"a": [1, 2, 3]}), + pa.table({"a": [1], "b": [2], "c": [3]}), + pa.table({"a": ["a", "bb", "ccc"]}), + ], +) +def test_pack_and_unpack(arrow_tbl): + plc_tbl = plc.interop.from_arrow(arrow_tbl) + packed = plc.contiguous_split.pack(plc_tbl) + res = plc.contiguous_split.unpack(packed) + assert_table_eq(arrow_tbl, res) From 1b98db842f47ed00769dddf3fb84c33cc2ce87c9 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 8 Oct 2024 13:56:48 +0200 Subject: [PATCH 02/16] doc --- .../pylibcudf/pylibcudf/contiguous_split.pyx | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx index 78d5cbc6016..41bc4fa4936 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pyx +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -15,6 +15,14 @@ from .table cimport Table cdef class PackedColumns: + """Column data in a serialized format. + + Contains data from an array of columns in two contiguous buffers: + one on host, which contains table metadata and one on device which + contains the table data. + + For details, see :cpp:class:`cudf::packed_columns`. + """ def __init__(self): raise ValueError( "PackedColumns should not be constructed directly. " @@ -29,12 +37,44 @@ cdef class PackedColumns: return out cpdef PackedColumns pack(Table input): + """Deep-copy a table into a serialized contiguous memory format. + + For details, see :cpp:func:`cudf::pack`. + + Parameters + ---------- + input : Table + Table to pack. + + Returns + ------- + PackedColumns + The packed columns. + """ return PackedColumns.from_libcudf( make_unique[packed_columns](cpp_pack(input.view())) ) cpdef Table unpack(PackedColumns input): + """Deserialize the result of `pack`. + + Copies the result of a serialized table into a table. + Contrary to the libcudf C++ function, the returned table is a copy + of the serialized data. + + For details, see :cpp:func:`cudf::unpack`. + + Parameters + ---------- + input : PackedColumns + The packed columns to unpack. + + Returns + ------- + Table + Copy of the packed columns. + """ cdef table_view v = cpp_unpack(dereference(input.c_obj)) cdef unique_ptr[table] t = make_unique[table](v) return Table.from_libcudf(move(t)) From e7140fb357d57090f54373d4d6e01fbf4cec1a08 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 9 Oct 2024 10:06:35 +0200 Subject: [PATCH 03/16] property: metadata and gpu_data --- .../pylibcudf/pylibcudf/contiguous_split.pyx | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx index 41bc4fa4936..779c085a577 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pyx +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from cython.operator cimport dereference +from libc.stdint cimport uint8_t, uintptr_t from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from pylibcudf.libcudf.contiguous_split cimport ( @@ -11,8 +12,13 @@ from pylibcudf.libcudf.contiguous_split cimport ( from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view +from .gpumemoryview cimport gpumemoryview from .table cimport Table +from types import SimpleNamespace + +import numpy as np + cdef class PackedColumns: """Column data in a serialized format. @@ -36,6 +42,47 @@ cdef class PackedColumns: out.c_obj = move(data) return out + @property + def metadata(self): + """memoryview of the metadata (host memory)""" + cdef size_t size = dereference(dereference(self.c_obj).metadata).size() + cdef uint8_t* data = dereference(dereference(self.c_obj).metadata).data() + if size == 0: + return memoryview(np.ndarray(shape=(0,), dtype="uint8")) + return memoryview( + np.asarray( + SimpleNamespace( + owner = self, + __array_interface__ = { + 'data': (data, False), + 'shape': (size,), + 'typestr': '|u1', + 'strides': None, + 'version': 3, + } + ) + ) + ) + + @property + def gpu_data(self): + """gpumemoryview of the data (device memory)""" + cdef size_t size = dereference(dereference(self.c_obj).gpu_data).size() + cdef void* data = dereference(dereference(self.c_obj).gpu_data).data() + return gpumemoryview( + SimpleNamespace( + owner = self, + __cuda_array_interface__ = { + 'data': (data, False), + 'shape': (size,), + 'typestr': '|u1', + 'strides': None, + 'version': 3, + } + ) + ) + + cpdef PackedColumns pack(Table input): """Deep-copy a table into a serialized contiguous memory format. From 9d5491b667e9ed96b19fc38c4014f995f064fff5 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 9 Oct 2024 13:02:45 +0200 Subject: [PATCH 04/16] unpack_from_memoryviews --- .../pylibcudf/pylibcudf/contiguous_split.pxd | 3 ++ .../pylibcudf/pylibcudf/contiguous_split.pyx | 39 ++++++++++++++++++- .../pylibcudf/libcudf/contiguous_split.pxd | 5 +++ .../pylibcudf/tests/test_contiguous_split.py | 9 +++++ 4 files changed, 55 insertions(+), 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/contiguous_split.pxd index 27453126ca8..2a10cb5b3d5 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pxd +++ b/python/pylibcudf/pylibcudf/contiguous_split.pxd @@ -3,6 +3,7 @@ from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.contiguous_split cimport packed_columns +from .gpumemoryview cimport gpumemoryview from .table cimport Table @@ -15,3 +16,5 @@ cdef class PackedColumns: cpdef PackedColumns pack(Table input) cpdef Table unpack(PackedColumns input) + +cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx index 779c085a577..767f9a7490a 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pyx +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -14,6 +14,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view from .gpumemoryview cimport gpumemoryview from .table cimport Table +from .utils cimport int_to_void_ptr from types import SimpleNamespace @@ -123,5 +124,41 @@ cpdef Table unpack(PackedColumns input): Copy of the packed columns. """ cdef table_view v = cpp_unpack(dereference(input.c_obj)) - cdef unique_ptr[table] t = make_unique[table](v) + cdef unique_ptr[table] t = make_unique[table](v) # Copy + return Table.from_libcudf(move(t)) + + +cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data): + """Deserialize the result of `pack`. + + Copies the result of a serialized table into a table. + Contrary to the libcudf C++ function, the returned table is a copy + of the serialized data. + + For details, see :cpp:func:`cudf::unpack`. + + Parameters + ---------- + metadata : memoryview + The packed metadata to unpack. + gpu_data : gpumemoryview + The packed gpu_data to unpack. + + Returns + ------- + Table + Copy of the packed columns. + """ + if metadata.nbytes == 0: + if gpu_data.__cuda_array_interface__["data"][0] != 0: + raise ValueError("expect an empty gpu_data when unpackking an empty table") + return Table.from_libcudf(make_unique[table](table_view())) + + # Extract the raw data pointers + cdef const uint8_t[::1] _metadata = metadata + cdef const uint8_t* metadata_ptr = &_metadata[0] + cdef const uint8_t* gpu_data_ptr = int_to_void_ptr(gpu_data.ptr) + + cdef table_view v = cpp_unpack(metadata_ptr, gpu_data_ptr) + cdef unique_ptr[table] t = make_unique[table](v) # Copy return Table.from_libcudf(move(t)) diff --git a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd index cadac6a0022..e19e7fff334 100644 --- a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd @@ -26,3 +26,8 @@ cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" nogil: cdef packed_columns pack (const table_view& input) except + cdef table_view unpack (const packed_columns& input) except + + + cdef table_view unpack ( + const uint8_t* metadata, + const uint8_t* gpu_data + ) except + diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py index f5746dea640..864189de093 100644 --- a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +import cupy import pyarrow as pa import pylibcudf as plc import pytest @@ -19,5 +20,13 @@ def test_pack_and_unpack(arrow_tbl): plc_tbl = plc.interop.from_arrow(arrow_tbl) packed = plc.contiguous_split.pack(plc_tbl) + res = plc.contiguous_split.unpack(packed) assert_table_eq(arrow_tbl, res) + + # Copy the buffers to simulate IO + metadata = memoryview(bytes(packed.metadata)) + gpu_data = plc.gpumemoryview(cupy.array(packed.gpu_data, copy=True)) + + res = plc.contiguous_split.unpack_from_memoryviews(metadata, gpu_data) + assert_table_eq(arrow_tbl, res) From e700db64843de0eb003f92b48e0071bf0790edee Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 10 Oct 2024 08:37:14 +0200 Subject: [PATCH 05/16] Apply suggestions from code review Co-authored-by: Bradley Dice --- python/pylibcudf/pylibcudf/contiguous_split.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx index 767f9a7490a..b92ff208c30 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pyx +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -151,7 +151,7 @@ cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data) """ if metadata.nbytes == 0: if gpu_data.__cuda_array_interface__["data"][0] != 0: - raise ValueError("expect an empty gpu_data when unpackking an empty table") + raise ValueError("Expected an empty gpu_data from unpacking an empty table") return Table.from_libcudf(make_unique[table](table_view())) # Extract the raw data pointers From 0e2468543dd7e39cd851bb48b52a1aeed8239e25 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 10 Oct 2024 11:38:07 +0200 Subject: [PATCH 06/16] doc --- python/pylibcudf/pylibcudf/contiguous_split.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx index b92ff208c30..ca6bf8e9cf6 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pyx +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -124,6 +124,8 @@ cpdef Table unpack(PackedColumns input): Copy of the packed columns. """ cdef table_view v = cpp_unpack(dereference(input.c_obj)) + # Since `Table.from_table_view` doesn't support an arbitrary owning object, + # we copy the table, see . cdef unique_ptr[table] t = make_unique[table](v) # Copy return Table.from_libcudf(move(t)) @@ -160,5 +162,7 @@ cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data) cdef const uint8_t* gpu_data_ptr = int_to_void_ptr(gpu_data.ptr) cdef table_view v = cpp_unpack(metadata_ptr, gpu_data_ptr) + # Since `Table.from_table_view` doesn't support an arbitrary owning object, + # we copy the table, see . cdef unique_ptr[table] t = make_unique[table](v) # Copy return Table.from_libcudf(move(t)) From 96d753ce3f86400fd87087affb1edf505efdbd1b Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 10 Oct 2024 11:39:44 +0200 Subject: [PATCH 07/16] cleanup --- python/pylibcudf/pylibcudf/contiguous_split.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx index ca6bf8e9cf6..652b12fc09e 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pyx +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -126,7 +126,7 @@ cpdef Table unpack(PackedColumns input): cdef table_view v = cpp_unpack(dereference(input.c_obj)) # Since `Table.from_table_view` doesn't support an arbitrary owning object, # we copy the table, see . - cdef unique_ptr[table] t = make_unique[table](v) # Copy + cdef unique_ptr[table] t = make_unique[table](v) return Table.from_libcudf(move(t)) @@ -164,5 +164,5 @@ cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data) cdef table_view v = cpp_unpack(metadata_ptr, gpu_data_ptr) # Since `Table.from_table_view` doesn't support an arbitrary owning object, # we copy the table, see . - cdef unique_ptr[table] t = make_unique[table](v) # Copy + cdef unique_ptr[table] t = make_unique[table](v) return Table.from_libcudf(move(t)) From 03e942da8b7ba0051e726f024c05f70abab22658 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 10 Oct 2024 15:02:57 +0200 Subject: [PATCH 08/16] avoid using cupy in test --- .../pylibcudf/pylibcudf/tests/test_contiguous_split.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py index 864189de093..85f6f90c498 100644 --- a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -1,6 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -import cupy import pyarrow as pa import pylibcudf as plc import pytest @@ -24,9 +23,7 @@ def test_pack_and_unpack(arrow_tbl): res = plc.contiguous_split.unpack(packed) assert_table_eq(arrow_tbl, res) - # Copy the buffers to simulate IO - metadata = memoryview(bytes(packed.metadata)) - gpu_data = plc.gpumemoryview(cupy.array(packed.gpu_data, copy=True)) - - res = plc.contiguous_split.unpack_from_memoryviews(metadata, gpu_data) + res = plc.contiguous_split.unpack_from_memoryviews( + packed.metadata, packed.gpu_data + ) assert_table_eq(arrow_tbl, res) From 7436a1b752a6ee1b867ca3afe97af62614f7382a Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 10 Oct 2024 22:19:29 +0200 Subject: [PATCH 09/16] PackedColumns.release() --- .../pylibcudf/pylibcudf/contiguous_split.pyx | 98 +++++++++++-------- .../pylibcudf/tests/test_contiguous_split.py | 36 ++++--- 2 files changed, 81 insertions(+), 53 deletions(-) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx index 652b12fc09e..5033239b055 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pyx +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from cython.operator cimport dereference -from libc.stdint cimport uint8_t, uintptr_t +from libc.stdint cimport uint8_t from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move +from libcpp.vector cimport vector from pylibcudf.libcudf.contiguous_split cimport ( pack as cpp_pack, packed_columns, @@ -12,14 +13,46 @@ from pylibcudf.libcudf.contiguous_split cimport ( from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view +from rmm.pylibrmm.device_buffer cimport DeviceBuffer + from .gpumemoryview cimport gpumemoryview from .table cimport Table from .utils cimport int_to_void_ptr -from types import SimpleNamespace -import numpy as np +cdef class HostBuffer: + """Owning host buffer that implements the buffer protocol""" + cdef unique_ptr[vector[uint8_t]] c_obj + cdef size_t nbytes + cdef Py_ssize_t[1] shape + cdef Py_ssize_t[1] strides + + @staticmethod + cdef HostBuffer from_unique_ptr( + unique_ptr[vector[uint8_t]] vec + ): + cdef HostBuffer out = HostBuffer() + out.c_obj = move(vec) + out.nbytes = dereference(out.c_obj).size() + out.shape[0] = out.nbytes + out.strides[0] = 1 + return out + def __getbuffer__(self, Py_buffer *buffer, int flags): + buffer.buf = dereference(self.c_obj).data() + buffer.format = NULL # byte + buffer.internal = NULL + buffer.itemsize = 1 + buffer.len = self.nbytes + buffer.ndim = 1 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + + def __releasebuffer__(self, Py_buffer *buffer): + pass cdef class PackedColumns: """Column data in a serialized format. @@ -43,43 +76,28 @@ cdef class PackedColumns: out.c_obj = move(data) return out - @property - def metadata(self): - """memoryview of the metadata (host memory)""" - cdef size_t size = dereference(dereference(self.c_obj).metadata).size() - cdef uint8_t* data = dereference(dereference(self.c_obj).metadata).data() - if size == 0: - return memoryview(np.ndarray(shape=(0,), dtype="uint8")) - return memoryview( - np.asarray( - SimpleNamespace( - owner = self, - __array_interface__ = { - 'data': (data, False), - 'shape': (size,), - 'typestr': '|u1', - 'strides': None, - 'version': 3, - } - ) - ) - ) - - @property - def gpu_data(self): - """gpumemoryview of the data (device memory)""" - cdef size_t size = dereference(dereference(self.c_obj).gpu_data).size() - cdef void* data = dereference(dereference(self.c_obj).gpu_data).data() - return gpumemoryview( - SimpleNamespace( - owner = self, - __cuda_array_interface__ = { - 'data': (data, False), - 'shape': (size,), - 'typestr': '|u1', - 'strides': None, - 'version': 3, - } + def release(self): + """Release the metadata (host memory) and the gpu_data (device) memory + + The ownership of the data are transferred to the returned buffers. After + this call, `self` is empty. + + Returns + ------- + memoryview + The metadata. + gpumemoryview + The gpu data. + """ + if not (dereference(self.c_obj).metadata and dereference(self.c_obj).gpu_data): + raise ValueError("Cannot release empty PackedColumns") + + return ( + memoryview( + HostBuffer.from_unique_ptr(move(dereference(self.c_obj).metadata)) + ), + gpumemoryview( + DeviceBuffer.c_from_unique_ptr(move(dereference(self.c_obj).gpu_data)) ) ) diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py index 85f6f90c498..2c06e86b531 100644 --- a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -5,17 +5,16 @@ import pytest from utils import assert_table_eq +mixed_pyarrow_tables = [ + pa.table([]), + pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}), + pa.table({"a": [1, 2, 3]}), + pa.table({"a": [1], "b": [2], "c": [3]}), + pa.table({"a": ["a", "bb", "ccc"]}), +] -@pytest.mark.parametrize( - "arrow_tbl", - [ - pa.table([]), - pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}), - pa.table({"a": [1, 2, 3]}), - pa.table({"a": [1], "b": [2], "c": [3]}), - pa.table({"a": ["a", "bb", "ccc"]}), - ], -) + +@pytest.mark.parametrize("arrow_tbl", mixed_pyarrow_tables) def test_pack_and_unpack(arrow_tbl): plc_tbl = plc.interop.from_arrow(arrow_tbl) packed = plc.contiguous_split.pack(plc_tbl) @@ -23,7 +22,18 @@ def test_pack_and_unpack(arrow_tbl): res = plc.contiguous_split.unpack(packed) assert_table_eq(arrow_tbl, res) - res = plc.contiguous_split.unpack_from_memoryviews( - packed.metadata, packed.gpu_data - ) + +@pytest.mark.parametrize("arrow_tbl", mixed_pyarrow_tables) +def test_pack_and_unpack_from_memoryviews(arrow_tbl): + plc_tbl = plc.interop.from_arrow(arrow_tbl) + packed = plc.contiguous_split.pack(plc_tbl) + + metadata, gpudata = packed.release() + + with pytest.raises(ValueError, match="Cannot release empty"): + packed.release() + + del packed # `metadata` and `gpudata` should survive + + res = plc.contiguous_split.unpack_from_memoryviews(metadata, gpudata) assert_table_eq(arrow_tbl, res) From 5b254a049b96ddf20a69ce1d7a2323f88744a738 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 10 Oct 2024 22:30:34 +0200 Subject: [PATCH 10/16] doc --- python/pylibcudf/pylibcudf/contiguous_split.pyx | 7 +++++++ python/pylibcudf/pylibcudf/tests/test_contiguous_split.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx index 5033239b055..95bd86f4bc3 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pyx +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -82,6 +82,13 @@ cdef class PackedColumns: The ownership of the data are transferred to the returned buffers. After this call, `self` is empty. + Examples + -------- + The two buffers can be unpacked using `unpack_from_memoryviews`: + + >>> packed = pylibcudf.contiguous_split.pack(...) + >>> pylibcudf.contiguous_split.unpack_from_memoryviews(*packed.release()) + Returns ------- memoryview diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py index 2c06e86b531..4299f13ed24 100644 --- a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -33,7 +33,7 @@ def test_pack_and_unpack_from_memoryviews(arrow_tbl): with pytest.raises(ValueError, match="Cannot release empty"): packed.release() - del packed # `metadata` and `gpudata` should survive + del packed # `metadata` and `gpudata` will survive res = plc.contiguous_split.unpack_from_memoryviews(metadata, gpudata) assert_table_eq(arrow_tbl, res) From b46d667cb6fdddc7fdbfae7b2def18ea93f34218 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 10 Oct 2024 22:32:44 +0200 Subject: [PATCH 11/16] doc --- python/pylibcudf/pylibcudf/contiguous_split.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx index 95bd86f4bc3..54bacd769e2 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pyx +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -77,7 +77,7 @@ cdef class PackedColumns: return out def release(self): - """Release the metadata (host memory) and the gpu_data (device) memory + """Returns the metadata and gpu data, and releases the ownership. The ownership of the data are transferred to the returned buffers. After this call, `self` is empty. @@ -92,9 +92,9 @@ cdef class PackedColumns: Returns ------- memoryview - The metadata. + The metadata (host memory) gpumemoryview - The gpu data. + The gpu data (device memory) """ if not (dereference(self.c_obj).metadata and dereference(self.c_obj).gpu_data): raise ValueError("Cannot release empty PackedColumns") From 0563e933e2202cb52097a9165498d371eed78168 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 11 Oct 2024 08:23:05 +0200 Subject: [PATCH 12/16] doc --- .../pylibcudf/pylibcudf/contiguous_split.pyx | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx index 54bacd769e2..ed926a3fcc0 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pyx +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -77,24 +77,17 @@ cdef class PackedColumns: return out def release(self): - """Returns the metadata and gpu data, and releases the ownership. + """Releases and returns the underlying serialized metadata and gpu data. - The ownership of the data are transferred to the returned buffers. After + The ownership of the memory are transferred to the returned buffers. After this call, `self` is empty. - Examples - -------- - The two buffers can be unpacked using `unpack_from_memoryviews`: - - >>> packed = pylibcudf.contiguous_split.pack(...) - >>> pylibcudf.contiguous_split.unpack_from_memoryviews(*packed.release()) - Returns ------- - memoryview - The metadata (host memory) - gpumemoryview - The gpu data (device memory) + memoryview (of a HostBuffer) + The serialized metadata as contiguous host memory. + gpumemoryview (of a rmm.DeviceBuffer) + The serialized gpu data as contiguous device memory. """ if not (dereference(self.c_obj).metadata and dereference(self.c_obj).gpu_data): raise ValueError("Cannot release empty PackedColumns") @@ -112,6 +105,18 @@ cdef class PackedColumns: cpdef PackedColumns pack(Table input): """Deep-copy a table into a serialized contiguous memory format. + Later use `unpack` or `unpack_from_memoryviews` to unpack the serialized + data back into the table. + + Examples + -------- + >>> packed = pylibcudf.contiguous_split.pack(...) + >>> # Either unpack the whole `PackedColumns` at once. + >>> pylibcudf.contiguous_split.unpack(packed) + >>> # Or unpack the two serialized buffers in `PackedColumns`. + >>> metadata, gpu_data = packed.release() + >>> pylibcudf.contiguous_split.unpack_from_memoryviews(metadata, gpu_data) + For details, see :cpp:func:`cudf::pack`. Parameters From 6e5e8b88df3427782b78165e41014438cfe9dc79 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 11 Oct 2024 10:34:19 +0200 Subject: [PATCH 13/16] Update python/pylibcudf/pylibcudf/tests/test_contiguous_split.py Co-authored-by: Lawrence Mitchell --- python/pylibcudf/pylibcudf/tests/test_contiguous_split.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py index 4299f13ed24..677237b93c1 100644 --- a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -11,6 +11,8 @@ pa.table({"a": [1, 2, 3]}), pa.table({"a": [1], "b": [2], "c": [3]}), pa.table({"a": ["a", "bb", "ccc"]}), + pa.table({"a": [1, 2, None], "b": [None, 3, 4]}) + pa.table({"a": [["a", "b"], ["cde"]], "b": [{"alpha": [1, 2], "beta": None}]}) ] From 7b07c10efdf9fc83f8171985388a79e3234a3528 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 11 Oct 2024 10:46:34 +0200 Subject: [PATCH 14/16] fix mixed_pyarrow_tables --- .../pylibcudf/tests/test_contiguous_split.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py index 677237b93c1..40d7e353c6c 100644 --- a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -11,8 +11,16 @@ pa.table({"a": [1, 2, 3]}), pa.table({"a": [1], "b": [2], "c": [3]}), pa.table({"a": ["a", "bb", "ccc"]}), - pa.table({"a": [1, 2, None], "b": [None, 3, 4]}) - pa.table({"a": [["a", "b"], ["cde"]], "b": [{"alpha": [1, 2], "beta": None}]}) + pa.table({"a": [1, 2, None], "b": [None, 3, 4]}), + pa.table( + { + "a": [["a", "b"], ["cde"]], + "b": [ + {"alpha": [1, 2], "beta": None}, + {"alpha": [3, 4], "beta": 5}, + ], + } + ), ] From 6bc388c05e1e46315ace6079d377496cde2da2e3 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 11 Oct 2024 10:50:57 +0200 Subject: [PATCH 15/16] xfail https://github.com/rapidsai/cudf/issues/17061 --- .../pylibcudf/tests/test_contiguous_split.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py index 40d7e353c6c..74961011277 100644 --- a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -5,7 +5,7 @@ import pytest from utils import assert_table_eq -mixed_pyarrow_tables = [ +param_pyarrow_tables = [ pa.table([]), pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}), pa.table({"a": [1, 2, 3]}), @@ -21,10 +21,16 @@ ], } ), + pytest.param( + pa.array([{"a": [1, 2], "b": [3, 4]}]), + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/17061" + ), + ), ] -@pytest.mark.parametrize("arrow_tbl", mixed_pyarrow_tables) +@pytest.mark.parametrize("arrow_tbl", param_pyarrow_tables) def test_pack_and_unpack(arrow_tbl): plc_tbl = plc.interop.from_arrow(arrow_tbl) packed = plc.contiguous_split.pack(plc_tbl) @@ -33,7 +39,7 @@ def test_pack_and_unpack(arrow_tbl): assert_table_eq(arrow_tbl, res) -@pytest.mark.parametrize("arrow_tbl", mixed_pyarrow_tables) +@pytest.mark.parametrize("arrow_tbl", param_pyarrow_tables) def test_pack_and_unpack_from_memoryviews(arrow_tbl): plc_tbl = plc.interop.from_arrow(arrow_tbl) packed = plc.contiguous_split.pack(plc_tbl) From a29d54113a082f926ef26f9e6fdf1f3592a1f141 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 11 Oct 2024 11:49:28 +0000 Subject: [PATCH 16/16] Remove test case We can't contiguous-split a Column. --- python/pylibcudf/pylibcudf/tests/test_contiguous_split.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py index 74961011277..7a5c1664eed 100644 --- a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -21,12 +21,6 @@ ], } ), - pytest.param( - pa.array([{"a": [1, 2], "b": [3, 4]}]), - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/17061" - ), - ), ]