diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 3bc56ddffc3..6e596151871 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -14,6 +14,7 @@ This page provides API documentation for pylibcudf. copying gpumemoryview groupby + join scalar table types diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/join.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/join.rst new file mode 100644 index 00000000000..05b9709d116 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/join.rst @@ -0,0 +1,6 @@ +==== +join +==== + +.. automodule:: cudf._lib.pylibcudf.join + :members: diff --git a/python/cudf/cudf/_lib/cpp/join.pxd b/python/cudf/cudf/_lib/cpp/join.pxd index 171658c78ee..ea05256430a 100644 --- a/python/cudf/cudf/_lib/cpp/join.pxd +++ b/python/cudf/cudf/_lib/cpp/join.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -13,19 +13,20 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type ctypedef unique_ptr[device_uvector[size_type]] gather_map_type +ctypedef pair[gather_map_type, gather_map_type] gather_map_pair_type cdef extern from "cudf/join.hpp" namespace "cudf" nogil: - cdef pair[gather_map_type, gather_map_type] inner_join( + cdef gather_map_pair_type inner_join( const table_view left_keys, const table_view right_keys, ) except + - cdef pair[gather_map_type, gather_map_type] left_join( + cdef gather_map_pair_type left_join( const table_view left_keys, const table_view right_keys, ) except + - cdef pair[gather_map_type, gather_map_type] full_join( + cdef gather_map_pair_type full_join( const table_view left_keys, const table_view right_keys, ) except + diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 378be978cc0..65f2f8cdcc8 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -1,19 +1,10 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport make_unique, unique_ptr -from libcpp.pair cimport pair -from libcpp.utility cimport move - -from rmm._lib.device_buffer cimport device_buffer - -cimport cudf._lib.cpp.join as cpp_join from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.cpp.types cimport data_type, size_type, type_id -from cudf._lib.utils cimport table_view_from_columns + +from cudf._lib import pylibcudf # The functions below return the *gathermaps* that represent # the join result when joining on the keys `lhs` and `rhs`. 
# Explicit dispatch tables from the user-facing ``how`` string to the
# pylibcudf join implementation.  Looking the function up by attribute name
# (``getattr(pylibcudf.join, f"{how}_join")``) would also resolve join kinds
# whose return shape does not match the wrapper (e.g. ``how="left_semi"`` in
# ``join`` or ``how="inner"`` in ``semi_join``), so only the supported kinds
# are listed here.
_GATHER_MAP_PAIR_JOINS = {
    "inner": pylibcudf.join.inner_join,
    "left": pylibcudf.join.left_join,
    "outer": pylibcudf.join.full_join,
    # "full" is kept as an alias of "outer" for backward compatibility.
    "full": pylibcudf.join.full_join,
}
_LEFT_GATHER_MAP_JOINS = {
    "leftsemi": pylibcudf.join.left_semi_join,
    "leftanti": pylibcudf.join.left_anti_join,
}


def _lookup_join_func(dict dispatch, how):
    """Return the join function registered for ``how`` in ``dispatch``.

    Raises
    ------
    ValueError
        If ``how`` is not a string key of ``dispatch`` (this includes the
        default ``how=None``).
    """
    # Guard with isinstance so that None and unhashable values are reported
    # as an invalid join type rather than as AttributeError/TypeError.
    join_func = dispatch.get(how) if isinstance(how, str) else None
    if join_func is None:
        raise ValueError(f"Invalid join type {how}")
    return join_func


def _keys_to_table(list columns):
    """Build a pylibcudf Table from a list of key columns (read-only access)."""
    return pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns])


@acquire_spill_lock()
def join(list lhs, list rhs, how=None):
    """Return the left and right gather maps for joining ``lhs`` with ``rhs``.

    Parameters
    ----------
    lhs : list of Column
        Key columns of the left table.
    rhs : list of Column
        Key columns of the right table.
    how : {"inner", "left", "outer", "full"}
        Kind of join to perform; "outer" and "full" both run a full join.

    Returns
    -------
    tuple of (Column, Column)
        Gather maps for the left and right tables.

    Raises
    ------
    ValueError
        If ``how`` is not one of the supported join kinds.
    """
    # Validate ``how`` before converting any columns.
    join_func = _lookup_join_func(_GATHER_MAP_PAIR_JOINS, how)
    left_rows, right_rows = join_func(
        _keys_to_table(lhs),
        _keys_to_table(rhs),
    )
    return Column.from_pylibcudf(left_rows), Column.from_pylibcudf(right_rows)


@acquire_spill_lock()
def semi_join(list lhs, list rhs, how=None):
    """Return the left gather map for a left-semi or left-anti join.

    Parameters
    ----------
    lhs : list of Column
        Key columns of the left table.
    rhs : list of Column
        Key columns of the right table.
    how : {"leftsemi", "leftanti"}
        Kind of join to perform.

    Returns
    -------
    tuple of (Column, None)
        The gather map for the left table; the second element is always
        ``None`` because these joins produce no right gather map.

    Raises
    ------
    ValueError
        If ``how`` is not one of the supported join kinds.
    """
    join_func = _lookup_join_func(_LEFT_GATHER_MAP_JOINS, how)
    left_rows = join_func(
        _keys_to_table(lhs),
        _keys_to_table(rhs),
    )
    return Column.from_pylibcudf(left_rows), None
cimport aggregation, binaryop, copying, groupby, interop, join, unary from .column cimport Column from .gpumemoryview cimport gpumemoryview from .scalar cimport Scalar @@ -21,6 +21,7 @@ __all__ = [ "gpumemoryview", "groupby", "interop", + "join", "unary", "types", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 6f1eb0b6b67..35812b65046 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. -from . import aggregation, binaryop, copying, groupby, interop, unary +from . import aggregation, binaryop, copying, groupby, interop, join, unary from .column import Column from .gpumemoryview import gpumemoryview from .scalar import Scalar @@ -19,6 +19,7 @@ "gpumemoryview", "groupby", "interop", + "join", "unary", "types", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index d6ce9825ed3..b8cc59eed09 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -144,6 +144,9 @@ cdef class GroupBy: c_requests.push_back(move(request._to_libcudf_agg_request())) cdef pair[unique_ptr[table], vector[aggregation_result]] c_res + # TODO: Need to capture C++ exceptions indicating that an invalid type was used. + # We rely on libcudf to tell us this rather than checking the types beforehand + # ourselves. with nogil: c_res = move(dereference(self.c_obj).aggregate(c_requests)) return GroupBy._parse_outputs(move(c_res)) diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pxd b/python/cudf/cudf/_lib/pylibcudf/join.pxd new file mode 100644 index 00000000000..4014dd4a399 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/join.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map):
    # Wrap the device buffer held by a gather map in an INT32 Column.
    cdef device_buffer c_empty_buffer
    # The length is read before the buffer is released from the gather map.
    cdef size_type num_rows = dereference(gather_map.get()).size()
    # NOTE(review): the trailing (empty buffer, 0) arguments are presumably the
    # null mask and null count of cudf::column -- confirm against libcudf.
    return Column.from_libcudf(
        move(
            make_unique[column](
                data_type(type_id.INT32),
                num_rows,
                dereference(gather_map.get()).release(),
                move(c_empty_buffer),
                0
            )
        )
    )


cpdef tuple inner_join(Table left_keys, Table right_keys):
    """Compute the gather maps of an inner join between two tables.

    For details, see :cpp:func:`inner_join`.

    Parameters
    ----------
    left_keys : Table
        Left-hand table of the join.
    right_keys : Table
        Right-hand table of the join.

    Returns
    -------
    Tuple[Column, Column]
        Row indices into the left and right tables, in that order, describing
        the result of the join.
    """
    cdef cpp_join.gather_map_pair_type c_maps
    with nogil:
        c_maps = cpp_join.inner_join(left_keys.view(), right_keys.view())
    cdef Column left_rows = _column_from_gather_map(move(c_maps.first))
    cdef Column right_rows = _column_from_gather_map(move(c_maps.second))
    return left_rows, right_rows


cpdef tuple left_join(Table left_keys, Table right_keys):
    """Compute the gather maps of a left join between two tables.

    For details, see :cpp:func:`left_join`.

    Parameters
    ----------
    left_keys : Table
        Left-hand table of the join.
    right_keys : Table
        Right-hand table of the join.

    Returns
    -------
    Tuple[Column, Column]
        Row indices into the left and right tables, in that order, describing
        the result of the join.
    """
    cdef cpp_join.gather_map_pair_type c_maps
    with nogil:
        c_maps = cpp_join.left_join(left_keys.view(), right_keys.view())
    cdef Column left_rows = _column_from_gather_map(move(c_maps.first))
    cdef Column right_rows = _column_from_gather_map(move(c_maps.second))
    return left_rows, right_rows


cpdef tuple full_join(Table left_keys, Table right_keys):
    """Compute the gather maps of a full join between two tables.

    For details, see :cpp:func:`full_join`.

    Parameters
    ----------
    left_keys : Table
        Left-hand table of the join.
    right_keys : Table
        Right-hand table of the join.

    Returns
    -------
    Tuple[Column, Column]
        Row indices into the left and right tables, in that order, describing
        the result of the join.
    """
    cdef cpp_join.gather_map_pair_type c_maps
    with nogil:
        c_maps = cpp_join.full_join(left_keys.view(), right_keys.view())
    cdef Column left_rows = _column_from_gather_map(move(c_maps.first))
    cdef Column right_rows = _column_from_gather_map(move(c_maps.second))
    return left_rows, right_rows


cpdef Column left_semi_join(Table left_keys, Table right_keys):
    """Compute the gather map of a left semi join between two tables.

    For details, see :cpp:func:`left_semi_join`.

    Parameters
    ----------
    left_keys : Table
        Left-hand table of the join.
    right_keys : Table
        Right-hand table of the join.

    Returns
    -------
    Column
        Row indices into the left table describing the result of the join.
    """
    cdef cpp_join.gather_map_type c_map
    with nogil:
        c_map = cpp_join.left_semi_join(left_keys.view(), right_keys.view())
    return _column_from_gather_map(move(c_map))


cpdef Column left_anti_join(Table left_keys, Table right_keys):
    """Compute the gather map of a left anti join between two tables.

    For details, see :cpp:func:`left_anti_join`.

    Parameters
    ----------
    left_keys : Table
        Left-hand table of the join.
    right_keys : Table
        Right-hand table of the join.

    Returns
    -------
    Column
        Row indices into the left table describing the result of the join.
    """
    cdef cpp_join.gather_map_type c_map
    with nogil:
        c_map = cpp_join.left_anti_join(left_keys.view(), right_keys.view())
    return _column_from_gather_map(move(c_map))