From 056fcb1e3d9a804c8741c8a011ed5c53bf31ac93 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 9 Sep 2024 13:52:31 -0700 Subject: [PATCH 1/7] Add partitioning APIs to pylibcudf --- .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../api_docs/pylibcudf/partitioning.rst | 6 + python/pylibcudf/pylibcudf/CMakeLists.txt | 1 + python/pylibcudf/pylibcudf/__init__.pxd | 2 + python/pylibcudf/pylibcudf/__init__.py | 2 + .../pylibcudf/libcudf/partitioning.pxd | 7 ++ python/pylibcudf/pylibcudf/partitioning.pxd | 19 +++ python/pylibcudf/pylibcudf/partitioning.pyx | 109 ++++++++++++++++++ 8 files changed, 147 insertions(+) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst create mode 100644 python/pylibcudf/pylibcudf/partitioning.pxd create mode 100644 python/pylibcudf/pylibcudf/partitioning.pyx diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 6a2b66e8ea0..7e4cf0356f1 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -24,6 +24,7 @@ This page provides API documentation for pylibcudf. lists merge null_mask + partitioning quantiles reduce replace diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst new file mode 100644 index 00000000000..6951dbecca0 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst @@ -0,0 +1,6 @@ +============ +partitioning +============ + +.. automodule:: pylibcudf.partitioning + :members: diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index a4f17344cb0..8241444de38 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -30,6 +30,7 @@ set(cython_sources lists.pyx merge.pyx null_mask.pyx + partitioning.pyx quantiles.pyx reduce.pyx replace.pyx diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index 841efa59bda..d59b945a652 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -16,6 +16,7 @@ from . cimport ( lists, merge, null_mask, + partitioning, quantiles, reduce, replace, @@ -59,6 +60,7 @@ __all__ = [ "lists", "merge", "null_mask", + "partitioning", "quantiles", "reduce", "replace", diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index d3878a89a6a..8455cf5849d 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -27,6 +27,7 @@ lists, merge, null_mask, + partitioning, quantiles, reduce, replace, @@ -71,6 +72,7 @@ "lists", "merge", "null_mask", + "partitioning", "quantiles", "reduce", "replace", diff --git a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd index 1ea10e8a194..aa42e5af007 100644 --- a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd @@ -25,3 +25,10 @@ cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil: const column_view& partition_map, int num_partitions ) except + + + cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] \ + round_robin_partition "cudf::round_robin_partition" ( + const table_view& input, + int num_partitions, + int start_partition=* + ) except + diff --git a/python/pylibcudf/pylibcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/partitioning.pxd new file mode 100644 index 00000000000..aad60149fc4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/partitioning.pxd @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from .column cimport Column +from .table cimport Table + + +cpdef tuple[Table, list] hash_partition( + Table input, + list columns_to_hash, + int num_partitions +) + +cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partitions) + +cpdef tuple[Table, list] round_robin_partition( + Table input, + int num_partitions, + int start_partition=* +) diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx new file mode 100644 index 00000000000..36e7a88e063 --- /dev/null +++ b/python/pylibcudf/pylibcudf/partitioning.pyx @@ -0,0 +1,109 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +cimport pylibcudf.libcudf.types as libcudf_types +from libcpp.memory cimport unique_ptr +from libcpp.pair cimport pair +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.libcudf cimport partitioning as cpp_partitioning +from pylibcudf.libcudf.table.table cimport table + +from .column cimport Column +from .table cimport Table + + +cpdef tuple[Table, list] hash_partition( + Table input, + list columns_to_hash, + int num_partitions +): + """ + Partitions rows from the input table into multiple output tables. + + Parameters + ---------- + input : Table + The table to partition + columns_to_hash : list[int] + Indices of input columns to hash + num_partitions : int + The number of partitions to use + + Returns + ------- + tuple[Table, list[int]] + An output table and a vector of row offsets to each partition + """ + cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result + + with nogil: + c_result = move( + cpp_partitioning.hash_partition( + table.view(), columns_to_hash, num_partitions + ) + ) + + return Table.from_libcudf(move(c_result.first)), c_result.second + +cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partitions): + """ + Partitions rows of `t` according to the mapping specified by `partition_map`. + + Parameters + ---------- + t : Table + The table to partition + partition_map : list[int] + Non-nullable column of integer values that map each row + in `t` to it's partition. + num_partitions : int + The total number of partitions + + Returns + ------- + tuple[Table, list[int]] + An output table and a list of row offsets to each partition + """ + cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result + + with nogil: + c_result = move( + cpp_partitioning.partition(t.view(), partition_map.view(), num_partitions) + ) + + return Table.from_libcudf(move(c_result.first)), c_result.second + + +cpdef tuple[Table, list] round_robin_partition( + Table input, + int num_partitions, + int start_partition=0 +): + """ + Round-robin partition. + + Parameters + ---------- + input : Table + The input table to be round-robin partitioned + num_partitions : int + Number of partitions for the table + start_partition : int, default 0 + Index of the 1st partition + + Returns + ------- + tuple[Table, list[int]] + The partitioned table and the partition offsets + for each partition within the table. + """ + cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result + + with nogil: + c_result = move( + cpp_partitioning.round_robin_partition( + table.view(), num_partitions, start_partition + ) + ) + + return Table.from_libcudf(move(c_result.first)), c_result.second From 9c606189d7cf1c4fc0c8448b89455c3f91ef8d83 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 9 Sep 2024 14:30:45 -0700 Subject: [PATCH 2/7] fix compilation failures --- .../pylibcudf/libcudf/partitioning.pxd | 2 +- python/pylibcudf/pylibcudf/partitioning.pyx | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd index aa42e5af007..89bddbffab5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd @@ -30,5 +30,5 @@ cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil: round_robin_partition "cudf::round_robin_partition" ( const table_view& input, int num_partitions, - int start_partition=* + int start_partition ) except + diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx index 36e7a88e063..074137d0418 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pyx +++ b/python/pylibcudf/pylibcudf/partitioning.pyx @@ -35,15 +35,17 @@ cpdef tuple[Table, list] hash_partition( An output table and a vector of row offsets to each partition """ cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result + cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash + cdef int c_num_partitions = num_partitions with nogil: c_result = move( cpp_partitioning.hash_partition( - table.view(), columns_to_hash, num_partitions + input.view(), c_columns_to_hash, c_num_partitions ) ) - return Table.from_libcudf(move(c_result.first)), c_result.second + return Table.from_libcudf(move(c_result.first)), list(c_result.second) cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partitions): """ @@ -65,13 +67,14 @@ cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partit An output table and a list of row offsets to each partition """ cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result + cdef int c_num_partitions = num_partitions with nogil: c_result = move( - cpp_partitioning.partition(t.view(), partition_map.view(), num_partitions) + cpp_partitioning.partition(t.view(), partition_map.view(), c_num_partitions) ) - return Table.from_libcudf(move(c_result.first)), c_result.second + return Table.from_libcudf(move(c_result.first)), list(c_result.second) cpdef tuple[Table, list] round_robin_partition( @@ -98,12 +101,14 @@ cpdef tuple[Table, list] round_robin_partition( for each partition within the table. """ cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result + cdef int c_num_partitions = num_partitions + cdef int c_start_partition = start_partition with nogil: c_result = move( cpp_partitioning.round_robin_partition( - table.view(), num_partitions, start_partition + input.view(), c_num_partitions, c_start_partition ) ) - return Table.from_libcudf(move(c_result.first)), c_result.second + return Table.from_libcudf(move(c_result.first)), list(c_result.second) From 2fc2cd7f5081f1497f0dc2d20272dd30d368f52a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 9 Sep 2024 15:52:59 -0700 Subject: [PATCH 3/7] Add unit tests --- python/cudf/cudf/_lib/hash.pyx | 35 ++++--------- python/cudf/cudf/_lib/partitioning.pyx | 35 +++---------- python/pylibcudf/pylibcudf/partitioning.pyx | 2 +- .../pylibcudf/tests/test_partitioning.py | 51 +++++++++++++++++++ 4 files changed, 68 insertions(+), 55 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/tests/test_partitioning.py diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index 48f75b12a73..9b7ab0888d2 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -3,11 +3,8 @@ from cudf.core.buffer import acquire_spill_lock from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair from libcpp.utility cimport move -from libcpp.vector cimport vector -cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.hash cimport ( md5, @@ -19,37 +16,23 @@ from pylibcudf.libcudf.hash cimport ( sha512, xxhash_64, ) -from pylibcudf.libcudf.partitioning cimport ( - hash_partition as cpp_hash_partition, -) -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns +from cudf._lib.utils cimport table_view_from_columns + +import pylibcudf as plc @acquire_spill_lock() -def hash_partition(list source_columns, object columns_to_hash, +def hash_partition(list source_columns, list columns_to_hash, int num_partitions): - cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash - cdef int c_num_partitions = num_partitions - cdef table_view c_source_view = table_view_from_columns(source_columns) - - cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result - with nogil: - c_result = move( - cpp_hash_partition( - c_source_view, - c_columns_to_hash, - c_num_partitions - ) - ) - - return ( - columns_from_unique_ptr(move(c_result.first)), - list(c_result.second) + plc_table, offsets = plc.partitioning.hash_partition( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), + columns_to_hash, + num_partitions ) + return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx index d94f0e1b564..63fd26f0134 100644 --- a/python/cudf/cudf/_lib/partitioning.pyx +++ b/python/cudf/cudf/_lib/partitioning.pyx @@ -2,24 +2,13 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair -from libcpp.utility cimport move -from libcpp.vector cimport vector - -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.partitioning cimport partition as cpp_partition -from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.table.table_view cimport table_view - from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns + +import pylibcudf as plc from cudf._lib.reduce import minmax from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count -cimport pylibcudf.libcudf.types as libcudf_types - @acquire_spill_lock() def partition(list source_columns, Column partition_map, @@ -50,25 +39,15 @@ def partition(list source_columns, Column partition_map, if num_partitions is None: num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True) - cdef int c_num_partitions = num_partitions - cdef table_view c_source_view = table_view_from_columns(source_columns) - - cdef column_view c_partition_map_view = partition_map.view() - cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result if partition_map.size > 0: lo, hi = minmax(partition_map) if lo < 0 or hi >= num_partitions: raise ValueError("Partition map has invalid values") - with nogil: - c_result = move( - cpp_partition( - c_source_view, - c_partition_map_view, - c_num_partitions - ) - ) - return ( - columns_from_unique_ptr(move(c_result.first)), list(c_result.second) + plc_table, offsets = plc.partitioning.partitioning( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), + partition_map.to_pylibcudf(mode="read"), + num_partitions ) + return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx index 074137d0418..30386d68dc8 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pyx +++ b/python/pylibcudf/pylibcudf/partitioning.pyx @@ -55,7 +55,7 @@ cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partit ---------- t : Table The table to partition - partition_map : list[int] + partition_map : Column Non-nullable column of integer values that map each row in `t` to it's partition. num_partitions : int diff --git a/python/pylibcudf/pylibcudf/tests/test_partitioning.py b/python/pylibcudf/pylibcudf/tests/test_partitioning.py new file mode 100644 index 00000000000..109f8bd47da --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_partitioning.py @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest + + +@pytest.fixture +def pa_table(): + return pa.table({"a": [1, 2, 3], "b": [1, 2, 5], "c": [1, 2, 10]}) + + +def test_partition(pa_table): + plc_result, result_offsets = plc.partitioning.partition( + plc.interop.from_arrow(pa_table), + plc.interop.from_arrow(pa.array([0, 0, 0])), + 1, + ) + pa_result = plc.interop.to_arrow(plc_result) + pa_expected = pa.table( + [[1, 2, 3], [1, 2, 5], [1, 2, 10]], + schema=pa.schema([pa.field("", pa.int64(), nullable=False)] * 3), + ) + assert pa_result.equals(pa_expected) + assert result_offsets == [0, 3] + + +def test_hash_partition(pa_table): + plc_result, result_offsets = plc.partitioning.hash_partition( + plc.interop.from_arrow(pa_table), [0, 1], 1 + ) + pa_result = plc.interop.to_arrow(plc_result) + pa_expected = pa.table( + [[1, 2, 3], [1, 2, 5], [1, 2, 10]], + schema=pa.schema([pa.field("", pa.int64(), nullable=False)] * 3), + ) + assert pa_result.equals(pa_expected) + assert result_offsets == [0] + + +def test_round_robin_partition(pa_table): + plc_result, result_offsets = plc.partitioning.round_robin_partition( + plc.interop.from_arrow(pa_table), 1, 0 + ) + pa_result = plc.interop.to_arrow(plc_result) + pa_expected = pa.table( + [[1, 2, 3], [1, 2, 5], [1, 2, 10]], + schema=pa.schema([pa.field("", pa.int64(), nullable=False)] * 3), + ) + assert pa_result.equals(pa_expected) + assert result_offsets == [0] From 540a3563f01843e4d3a66abfcb8553157a3e0b19 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 9 Sep 2024 19:28:45 -0700 Subject: [PATCH 4/7] Typo --- python/cudf/cudf/_lib/partitioning.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx index 63fd26f0134..13997da8403 100644 --- a/python/cudf/cudf/_lib/partitioning.pyx +++ b/python/cudf/cudf/_lib/partitioning.pyx @@ -45,7 +45,7 @@ def partition(list source_columns, Column partition_map, if lo < 0 or hi >= num_partitions: raise ValueError("Partition map has invalid values") - plc_table, offsets = plc.partitioning.partitioning( + plc_table, offsets = plc.partitioning.partition( plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), partition_map.to_pylibcudf(mode="read"), num_partitions From 36ae931bae9d74931ba0e8b6ad8b7be8f5f0a027 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 25 Sep 2024 09:40:25 -0700 Subject: [PATCH 5/7] address review --- .../pylibcudf/tests/test_partitioning.py | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_partitioning.py b/python/pylibcudf/pylibcudf/tests/test_partitioning.py index 109f8bd47da..2e4f33a1bff 100644 --- a/python/pylibcudf/pylibcudf/tests/test_partitioning.py +++ b/python/pylibcudf/pylibcudf/tests/test_partitioning.py @@ -3,49 +3,52 @@ import pyarrow as pa import pylibcudf as plc import pytest +from utils import assert_table_eq -@pytest.fixture -def pa_table(): - return pa.table({"a": [1, 2, 3], "b": [1, 2, 5], "c": [1, 2, 10]}) +@pytest.fixture(scope="module") +def partitioning_data(): + data = {"a": [1, 2, 3], "b": [1, 2, 5], "c": [1, 2, 10]} + pa_table = pa.table(data) + return data, pa_table -def test_partition(pa_table): - plc_result, result_offsets = plc.partitioning.partition( +def test_partition(partitioning_data): + raw_data, pa_table = partitioning_data + result, result_offsets = plc.partitioning.partition( plc.interop.from_arrow(pa_table), plc.interop.from_arrow(pa.array([0, 0, 0])), 1, ) - pa_result = plc.interop.to_arrow(plc_result) - pa_expected = pa.table( - [[1, 2, 3], [1, 2, 5], [1, 2, 10]], + expected = pa.table( + list(raw_data.values()), schema=pa.schema([pa.field("", pa.int64(), nullable=False)] * 3), ) - assert pa_result.equals(pa_expected) + assert_table_eq(expected, result) assert result_offsets == [0, 3] -def test_hash_partition(pa_table): - plc_result, result_offsets = plc.partitioning.hash_partition( +def test_hash_partition(partitioning_data): + raw_data, pa_table = partitioning_data + result, result_offsets = plc.partitioning.hash_partition( plc.interop.from_arrow(pa_table), [0, 1], 1 ) - pa_result = plc.interop.to_arrow(plc_result) - pa_expected = pa.table( - [[1, 2, 3], [1, 2, 5], [1, 2, 10]], + expected = pa.table( + list(raw_data.values()), schema=pa.schema([pa.field("", pa.int64(), nullable=False)] * 3), ) - assert pa_result.equals(pa_expected) + assert_table_eq(expected, result) assert result_offsets == [0] -def test_round_robin_partition(pa_table): - plc_result, result_offsets = plc.partitioning.round_robin_partition( +def test_round_robin_partition(partitioning_data): + raw_data, pa_table = partitioning_data + result, result_offsets = plc.partitioning.round_robin_partition( plc.interop.from_arrow(pa_table), 1, 0 ) - pa_result = plc.interop.to_arrow(plc_result) - pa_expected = pa.table( - [[1, 2, 3], [1, 2, 5], [1, 2, 10]], + expected = pa.table( + list(raw_data.values()), schema=pa.schema([pa.field("", pa.int64(), nullable=False)] * 3), ) - assert pa_result.equals(pa_expected) + assert_table_eq(expected, result) assert result_offsets == [0] From eda42a69e99ec8f9e78fdeb08493e6d723ca2d48 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 25 Sep 2024 10:05:35 -0700 Subject: [PATCH 6/7] add ref to cpp func --- python/pylibcudf/pylibcudf/partitioning.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx index 30386d68dc8..8fa70daab5a 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pyx +++ b/python/pylibcudf/pylibcudf/partitioning.pyx @@ -20,6 +20,8 @@ cpdef tuple[Table, list] hash_partition( """ Partitions rows from the input table into multiple output tables. + For details, see :cpp:func:`hash_partition`. + Parameters ---------- input : Table @@ -51,6 +53,8 @@ cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partit """ Partitions rows of `t` according to the mapping specified by `partition_map`. + For details, see :cpp:func:`partition`. + Parameters ---------- t : Table @@ -85,6 +89,8 @@ cpdef tuple[Table, list] round_robin_partition( """ Round-robin partition. + For details, see :cpp:func:`round_robin_partition`. + Parameters ---------- input : Table From 67f01cedfec0d8ff43af56b2bb20627022333f93 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 25 Sep 2024 15:57:56 -0700 Subject: [PATCH 7/7] Update test_partitioning.py --- .../pylibcudf/tests/test_partitioning.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_partitioning.py b/python/pylibcudf/pylibcudf/tests/test_partitioning.py index 2e4f33a1bff..444d0089d2c 100644 --- a/python/pylibcudf/pylibcudf/tests/test_partitioning.py +++ b/python/pylibcudf/pylibcudf/tests/test_partitioning.py @@ -10,13 +10,14 @@ def partitioning_data(): data = {"a": [1, 2, 3], "b": [1, 2, 5], "c": [1, 2, 10]} pa_table = pa.table(data) - return data, pa_table + plc_table = plc.interop.from_arrow(pa_table) + return data, plc_table, pa_table def test_partition(partitioning_data): - raw_data, pa_table = partitioning_data + raw_data, plc_table, pa_table = partitioning_data result, result_offsets = plc.partitioning.partition( - plc.interop.from_arrow(pa_table), + plc_table, plc.interop.from_arrow(pa.array([0, 0, 0])), 1, ) @@ -29,9 +30,9 @@ def test_partition(partitioning_data): def test_hash_partition(partitioning_data): - raw_data, pa_table = partitioning_data + raw_data, plc_table, pa_table = partitioning_data result, result_offsets = plc.partitioning.hash_partition( - plc.interop.from_arrow(pa_table), [0, 1], 1 + plc_table, [0, 1], 1 ) expected = pa.table( list(raw_data.values()), @@ -42,9 +43,9 @@ def test_hash_partition(partitioning_data): def test_round_robin_partition(partitioning_data): - raw_data, pa_table = partitioning_data + raw_data, plc_table, pa_table = partitioning_data result, result_offsets = plc.partitioning.round_robin_partition( - plc.interop.from_arrow(pa_table), 1, 0 + plc_table, 1, 0 ) expected = pa.table( list(raw_data.values()),