From ba21673b93c7ba83f2b0dc76f2294535f684f120 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 18 Nov 2024 10:46:14 -0800
Subject: [PATCH] Remove cudf._lib.concat in favor of inlining pylibcudf
 (#17344)

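Contributes to https://github.com/rapidsai/cudf/issues/17317

Removes the `cudf._lib.concat` Cython module. `column.concat_columns` and
`DataFrame._concat` now call `pylibcudf.concatenate.concatenate` directly
while holding the spill lock, and `data_from_pylibcudf_table` is changed from
`cdef` to `cpdef` so that `dataframe.py` can call it through
`libcudf.utils`. The categorical `_concat` now passes the codes columns
(rather than the original `objs`) to `column.concat_columns`, which already
filters out zero-length inputs.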

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/17344
---
 python/cudf/cudf/_lib/CMakeLists.txt        |  1 -
 python/cudf/cudf/_lib/__init__.py           |  1 -
 python/cudf/cudf/_lib/concat.pyx            | 35 ---------------------
 python/cudf/cudf/_lib/utils.pxd             |  2 +-
 python/cudf/cudf/_lib/utils.pyx             |  2 +-
 python/cudf/cudf/core/column/categorical.py |  4 +--
 python/cudf/cudf/core/column/column.py      |  9 +++++-
 python/cudf/cudf/core/dataframe.py          | 29 ++++++++++++++---
 8 files changed, 36 insertions(+), 47 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/concat.pyx

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 2fc82a57a6f..13beec3c7f7 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -16,7 +16,6 @@ set(cython_sources
     aggregation.pyx
     binaryop.pyx
     column.pyx
-    concat.pyx
     copying.pyx
     csv.pyx
     datetime.pyx
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index cd86767f0cd..a63bc1a3d1c 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -3,7 +3,6 @@
 
 from . import (
     binaryop,
-    concat,
     copying,
     csv,
     datetime,
diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx
deleted file mode 100644
index e6c2d136f0d..00000000000
--- a/python/cudf/cudf/_lib/concat.pyx
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from libcpp cimport bool
-
-from cudf._lib.column cimport Column
-from cudf._lib.utils cimport data_from_pylibcudf_table
-
-import pylibcudf
-
-from cudf.core.buffer import acquire_spill_lock
-
-
-@acquire_spill_lock()
-def concat_columns(object columns):
-    return Column.from_pylibcudf(
-        pylibcudf.concatenate.concatenate(
-            [col.to_pylibcudf(mode="read") for col in columns]
-        )
-    )
-
-
-@acquire_spill_lock()
-def concat_tables(object tables, bool ignore_index=False):
-    plc_tables = []
-    for table in tables:
-        cols = table._columns
-        if not ignore_index:
-            cols = table._index._columns + cols
-        plc_tables.append(pylibcudf.Table([c.to_pylibcudf(mode="read") for c in cols]))
-
-    return data_from_pylibcudf_table(
-        pylibcudf.concatenate.concatenate(plc_tables),
-        column_names=tables[0]._column_names,
-        index_names=None if ignore_index else tables[0]._index_names
-    )
diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd
index 623c5064a1a..f273aeb4270 100644
--- a/python/cudf/cudf/_lib/utils.pxd
+++ b/python/cudf/cudf/_lib/utils.pxd
@@ -10,7 +10,7 @@ from pylibcudf.libcudf.table.table cimport table, table_view
 
 cdef data_from_unique_ptr(
     unique_ptr[table] c_tbl, column_names, index_names=*)
-cdef data_from_pylibcudf_table(tbl, column_names, index_names=*)
+cpdef data_from_pylibcudf_table(tbl, column_names, index_names=*)
 cpdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *)
 cdef data_from_table_view(
     table_view tv, object owner, object column_names, object index_names=*)
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 292de82e4c4..2ccc6ca34dc 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -309,7 +309,7 @@ cdef data_from_unique_ptr(
     )
 
 
-cdef data_from_pylibcudf_table(tbl, column_names, index_names=None):
+cpdef data_from_pylibcudf_table(tbl, column_names, index_names=None):
     return _data_from_columns(
         columns_from_pylibcudf_table(tbl),
         column_names,
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index b7d5e8658a0..7354b917f90 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1204,9 +1204,7 @@ def _concat(
         elif newsize == 0:
             codes_col = column.column_empty(0, head.codes.dtype, masked=True)
         else:
-            # Filter out inputs that have 0 length, then concatenate.
-            codes = [o for o in codes if len(o)]
-            codes_col = libcudf.concat.concat_columns(objs)
+            codes_col = column.concat_columns(codes)  # type: ignore[arg-type]
 
         codes_col = as_unsigned_codes(
             len(cats),
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 03dcf6bec1e..f6eaea4b783 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -19,6 +19,7 @@
 from pandas.core.arrays.arrow.extension_types import ArrowIntervalType
 from typing_extensions import Self
 
+import pylibcudf as plc
 import rmm
 
 import cudf
@@ -2300,4 +2301,10 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
         return column_empty(0, head.dtype, masked=True)
 
     # Filter out inputs that have 0 length, then concatenate.
-    return libcudf.concat.concat_columns([o for o in objs if len(o)])
+    objs_with_len = [o for o in objs if len(o)]
+    with acquire_spill_lock():
+        return Column.from_pylibcudf(
+            plc.concatenate.concatenate(
+                [col.to_pylibcudf(mode="read") for col in objs_with_len]
+            )
+        )
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 9be5aabb4e2..bd78d5dd9f1 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1787,11 +1787,32 @@ def _concat(
             )
 
         # Concatenate the Tables
-        out = cls._from_data(
-            *libcudf.concat.concat_tables(
-                tables, ignore_index=ignore_index or are_all_range_index
+        ignore = ignore_index or are_all_range_index
+        index_names = None if ignore else tables[0]._index_names
+        column_names = tables[0]._column_names
+        with acquire_spill_lock():
+            plc_tables = [
+                plc.Table(
+                    [
+                        c.to_pylibcudf(mode="read")
+                        for c in (
+                            table._columns
+                            if ignore
+                            else itertools.chain(
+                                table._index._columns, table._columns
+                            )
+                        )
+                    ]
+                )
+                for table in tables
+            ]
+
+            concatted = libcudf.utils.data_from_pylibcudf_table(
+                plc.concatenate.concatenate(plc_tables),
+                column_names=column_names,
+                index_names=index_names,
             )
-        )
+        out = cls._from_data(*concatted)
 
         # If ignore_index is True, all input frames are empty, and at
         # least one input frame has an index, assign a new RangeIndex