Migrate CSV writer to pylibcudf #17163

Merged
40 commits merged on Nov 21, 2024

Changes from 12 commits

Commits
cd78dcd
[WIP] Migrate CSV writer to pylibcudf
Matt711 Oct 24, 2024
4f899b6
migrate the CSV writer
Matt711 Oct 24, 2024
6723045
get existing test passing
Matt711 Oct 25, 2024
cbeea6b
add a test
Matt711 Oct 29, 2024
d3998fc
Merge branch 'branch-24.12' into pylibcudf-io-csv
Matt711 Nov 1, 2024
8286f74
clean up
Matt711 Nov 5, 2024
dc93b8b
merge conflict
Matt711 Nov 6, 2024
d7d21ca
add more test cases
Matt711 Nov 6, 2024
a6007d7
Merge branch 'branch-24.12' into pylibcudf-io-csv
Matt711 Nov 6, 2024
2ce052f
Merge branch 'branch-24.12' into pylibcudf-io-csv
Matt711 Nov 7, 2024
1a8c38c
fix type checking in SinkInfo
Matt711 Nov 7, 2024
afaa46c
Merge branch 'branch-24.12' into pylibcudf-io-csv
Matt711 Nov 7, 2024
b132456
Merge branch 'branch-24.12' into pylibcudf-io-csv
Matt711 Nov 11, 2024
5f8e4b9
Expose the CsvWriterOptions nad CsvWriterOptionsBuilder
Matt711 Nov 12, 2024
2287eba
Merge branch 'branch-24.12' into pylibcudf-io-csv
Matt711 Nov 12, 2024
3706bd5
Add type stubs
Matt711 Nov 12, 2024
15370b0
commit declaration file
Matt711 Nov 12, 2024
f10492c
merge conflict
Matt711 Nov 15, 2024
dd72323
Merge branch 'branch-24.12' into pylibcudf-io-csv
Matt711 Nov 15, 2024
4e706bd
add type stub
Matt711 Nov 15, 2024
99e770a
switch to options arg
Matt711 Nov 15, 2024
cf11996
remove casts
Matt711 Nov 15, 2024
f9e4570
clean up
Matt711 Nov 15, 2024
b7d971a
keep tables alive
Matt711 Nov 15, 2024
464ae48
address review
Matt711 Nov 15, 2024
2643831
addres reviews
Matt711 Nov 15, 2024
fb2e1bc
Merge branch 'branch-24.12' into pylibcudf-io-csv
Matt711 Nov 18, 2024
ffe895e
add to all
Matt711 Nov 18, 2024
496d664
clean up
Matt711 Nov 18, 2024
d6b3667
clean up
Matt711 Nov 18, 2024
94599b9
simplify fixtures
Matt711 Nov 18, 2024
4bdd6dd
Merge branch 'branch-24.12' into pylibcudf-io-csv
Matt711 Nov 19, 2024
18978ff
Merge branch 'branch-24.12' into pylibcudf-io-csv
Matt711 Nov 19, 2024
ffdee70
address review
Matt711 Nov 19, 2024
3a5dca6
fix write_csv doc string
Matt711 Nov 19, 2024
2da0446
finish addressing review
Matt711 Nov 19, 2024
0b6042a
Merge branch 'branch-24.12' into pylibcudf-io-csv
Matt711 Nov 21, 2024
e33d2ea
address review
Matt711 Nov 21, 2024
becc262
remove post processing
Matt711 Nov 21, 2024
aac021f
clean up
Matt711 Nov 21, 2024
98 changes: 24 additions & 74 deletions python/cudf/cudf/_lib/csv.pyx
@@ -1,10 +1,6 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector

cimport pylibcudf.libcudf.types as libcudf_types

@@ -23,23 +19,16 @@ from cudf.core.buffer import acquire_spill_lock

from libcpp cimport bool

from pylibcudf.libcudf.io.csv cimport (
csv_writer_options,
write_csv as cpp_write_csv,
)
from pylibcudf.libcudf.io.data_sink cimport data_sink
from pylibcudf.libcudf.io.types cimport sink_info
from pylibcudf.libcudf.table.table_view cimport table_view

from cudf._lib.io.utils cimport make_sink_info
from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table
from cudf._lib.utils cimport data_from_pylibcudf_io

import pylibcudf as plc

from cudf.api.types import is_hashable

from pylibcudf.types cimport DataType

from cudf._lib.json import _dtype_to_names_list

CSV_HEX_TYPE_MAP = {
"hex": np.dtype("int64"),
"hex64": np.dtype("int64"),
@@ -318,59 +307,28 @@ def write_csv(
--------
cudf.to_csv
"""
cdef table_view input_table_view = table_view_from_table(
table, not index
)
cdef bool include_header_c = header
cdef char delim_c = ord(sep)
cdef string line_term_c = lineterminator.encode()
cdef string na_c = na_rep.encode()
cdef int rows_per_chunk_c = rows_per_chunk
cdef vector[string] col_names
cdef string true_value_c = 'True'.encode()
cdef string false_value_c = 'False'.encode()
cdef unique_ptr[data_sink] data_sink_c
cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c)

if header is True:
all_names = columns_apply_na_rep(table._column_names, na_rep)
if index is True:
all_names = table._index.names + all_names

if len(all_names) > 0:
col_names.reserve(len(all_names))
if len(all_names) == 1:
if all_names[0] in (None, ''):
col_names.push_back('""'.encode())
else:
col_names.push_back(
str(all_names[0]).encode()
)
else:
for idx, col_name in enumerate(all_names):
if col_name is None:
col_names.push_back(''.encode())
else:
col_names.push_back(
str(col_name).encode()
)

cdef csv_writer_options options = move(
csv_writer_options.builder(sink_info_c, input_table_view)
.names(col_names)
.na_rep(na_c)
.include_header(include_header_c)
.rows_per_chunk(rows_per_chunk_c)
.line_terminator(line_term_c)
.inter_column_delimiter(delim_c)
.true_value(true_value_c)
.false_value(false_value_c)
.build()
)

col_names = []
for name in table._column_names:
col_names.append((name, _dtype_to_names_list(table[name]._column)))
for i, t in enumerate(col_names):
if t[0] is None or pd.isnull(t[0]):
col_names[i] = (na_rep, t[1])
columns = [col.to_pylibcudf(mode="read") for col in table._columns]
try:
with nogil:
cpp_write_csv(options)
plc.io.csv.write_csv(
plc.io.SinkInfo([path_or_buf]),
plc.io.TableWithMetadata(
plc.Table(columns),
col_names
),
path_or_buf=path_or_buf,
sep=str(sep),
na_rep=str(na_rep),
header=header,
lineterminator=str(lineterminator),
rows_per_chunk=rows_per_chunk,
indices=table._index if index else None,
)
except OverflowError:
raise OverflowError(
f"Writing CSV file with chunksize={rows_per_chunk} failed. "
@@ -419,11 +377,3 @@ cdef DataType _get_plc_data_type_from_dtype(object dtype) except *:

dtype = cudf.dtype(dtype)
return dtype_to_pylibcudf_type(dtype)


def columns_apply_na_rep(column_names, na_rep):
return tuple(
na_rep if pd.isnull(col_name)
else col_name
for col_name in column_names
)
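
The removed `columns_apply_na_rep` helper and the new inline loop over `(name, children)` tuples apply the same rule: a column name that is null (`None` or NaN) is written out as the user-supplied `na_rep` string. A minimal standalone sketch of that rule (plain Python; `apply_na_rep` is a hypothetical name, not the actual cudf code):

```python
import pandas as pd


def apply_na_rep(column_names, na_rep):
    # Replace null (None/NaN) column names with the user's na_rep string,
    # mirroring what the removed columns_apply_na_rep helper did and what
    # the new loop over (name, children) tuples now does inline.
    return tuple(na_rep if pd.isnull(name) else name for name in column_names)


assert apply_na_rep(("a", None, float("nan")), "<NA>") == ("a", "<NA>", "<NA>")
```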
104 changes: 102 additions & 2 deletions python/pylibcudf/pylibcudf/io/csv.pyx
@@ -2,22 +2,26 @@

from libcpp cimport bool
from libcpp.map cimport map

from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector
from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
from pylibcudf.io.types cimport SourceInfo, SinkInfo, TableWithMetadata
from pylibcudf.libcudf.io.csv cimport (
csv_reader_options,
csv_writer_options,
read_csv as cpp_read_csv,
write_csv as cpp_write_csv,
)

from pylibcudf.libcudf.io.types cimport (
compression_type,
quote_style,
table_with_metadata,
)
from pylibcudf.libcudf.types cimport data_type, size_type
from pylibcudf.types cimport DataType

from pylibcudf.table cimport Table

cdef tuple _process_parse_dates_hex(list cols):
cdef vector[string] str_cols
Expand Down Expand Up @@ -80,6 +84,8 @@ def read_csv(
):
"""Reads a CSV file into a :py:class:`~.types.TableWithMetadata`.

For details, see :cpp:func:`read_csv`.

Parameters
----------
source_info : SourceInfo
@@ -261,3 +267,97 @@ def read_csv(
c_result = move(cpp_read_csv(options))

return TableWithMetadata.from_libcudf(c_result)


def write_csv(
Contributor:
Can this (should this? @vyasr) be a cpdef function?

Contributor Author:
Yes. We've been cpdef'ing them, so I did it here too.

Contributor:
I think it's good practice to do it consistently everywhere to enable full Cython usage. For I/O it's probably less important in the grand scheme of things since that shouldn't propagate bad typing afterwards in any way, but consistency helps.
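
For context on the discussion above, a minimal, hypothetical Cython sketch (not part of this PR's diff) of the difference between `def` and `cpdef`:

```cython
# example.pyx (hypothetical module, for illustration only)

def py_only(x):
    # A def function is reachable only through the Python call machinery.
    return x + 1

cpdef int py_and_c(int x):
    # A cpdef function additionally gets a C-level entry point, so other
    # Cython modules can call it directly without Python-call overhead.
    return x + 1
```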

SinkInfo sink_info,
TableWithMetadata table,
*,
object path_or_buf=None,
str sep=",",
str na_rep="",
bool header=True,
str lineterminator="\n",
int rows_per_chunk=8,
object indices=None,
):
"""
Writes a :py:class:`~pylibcudf.io.types.TableWithMetadata` to CSV format.

For details, see :cpp:func:`write_csv`.

Parameters
----------
sink_info: SinkInfo
The SinkInfo object to write to.
table : TableWithMetadata
The TableWithMetadata object containing the Table to write.
path_or_buf : object, default None
The source file-like object (eg. io.StringIO).
sep : str
Character to delimit column values.
na_rep : str
The string representation for null values.
header : bool, default True
Whether to write headers to csv. Includes the column names
and optionally, the index names (see ``index`` argument).
lineterminator : str, default '\\n'
The character used to determine the end of a line.
rows_per_chunk: int, default 8
The maximum number of rows to write at a time.
indices : object
The indices in the table.
"""
cdef bool include_header_c = header
cdef char delim_c = ord(sep)
cdef string line_term_c = lineterminator.encode()
cdef string na_c = na_rep.encode()
cdef int rows_per_chunk_c = rows_per_chunk
cdef vector[string] col_names
cdef string true_value_c = 'True'.encode()
cdef string false_value_c = 'False'.encode()

if header is True:
all_names = table.column_names()
if indices is not None:
all_names = indices.names + all_names
if len(all_names) > 0:
col_names.reserve(len(all_names))
if len(all_names) == 1:
if all_names[0] in (None, ''):
col_names.push_back('""'.encode())
else:
col_names.push_back(
str(all_names[0]).encode()
)
else:
for idx, col_name in enumerate(all_names):
if col_name is None:
col_names.push_back(''.encode())
else:
col_names.push_back(
str(col_name).encode()
)
cdef Table new_table = table.tbl
if indices is not None:
new_table = Table(
[col.to_pylibcudf(mode="read") for col in indices._columns] + table.columns
)
cdef csv_writer_options options = move(
csv_writer_options.builder(
sink_info.c_obj,
new_table.view()
)
.names(col_names)
.na_rep(na_c)
.include_header(include_header_c)
.rows_per_chunk(rows_per_chunk_c)
.line_terminator(line_term_c)
.inter_column_delimiter(delim_c)
.true_value(true_value_c)
.false_value(false_value_c)
.build()
)

with nogil:
cpp_write_csv(options)
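
Putting the pieces above together, a usage sketch of the new writer at the state shown in this diff might look like the following. This is an illustration, not code from the PR: the `(name, child_names)` tuples passed to `TableWithMetadata` and the use of `plc.interop.from_arrow` to build the input `Table` are assumptions based on how the cudf-side caller in this PR constructs its arguments.

```python
import io

import pyarrow as pa
import pylibcudf as plc

# Build a small pylibcudf Table (assumes plc.interop.from_arrow accepts a pyarrow Table).
arrow_tbl = pa.table({"a": [1, 2, 3], "b": ["x", "y", None]})
tbl = plc.interop.from_arrow(arrow_tbl)

# Write it to an in-memory sink using the signature shown above.
buf = io.BytesIO()
plc.io.csv.write_csv(
    plc.io.SinkInfo([buf]),
    plc.io.TableWithMetadata(tbl, [("a", []), ("b", [])]),  # (column name, child names)
    sep=",",
    na_rep="<NA>",
    header=True,
    lineterminator="\n",
    rows_per_chunk=8,
)
print(buf.getvalue().decode())
```

Note that later commits in this PR ("switch to options arg", "Expose the CsvWriterOptions and CsvWriterOptionsBuilder") rework this interface around an options-builder object, so the keyword-argument form above reflects only the intermediate state captured by these 12 commits.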
23 changes: 14 additions & 9 deletions python/pylibcudf/pylibcudf/io/types.pyx
@@ -267,25 +267,30 @@ cdef class SinkInfo:

cdef object initial_sink_cls = type(sinks[0])

if not all(isinstance(s, initial_sink_cls) for s in sinks):
if not all(isinstance(s, initial_sink_cls) or
(isinstance(sinks[0], io.IOBase) and isinstance(s, io.IOBase))
for s in sinks):
raise ValueError("All sinks must be of the same type!")

if initial_sink_cls in {io.StringIO, io.BytesIO, io.TextIOBase}:
if isinstance(sinks[0], io.IOBase):
data_sinks.reserve(len(sinks))
if isinstance(sinks[0], (io.StringIO, io.BytesIO)):
for s in sinks:
for s in sinks:
if isinstance(s, (io.StringIO, io.BytesIO)):
self.sink_storage.push_back(
unique_ptr[data_sink](new iobase_data_sink(s))
)
elif isinstance(sinks[0], io.TextIOBase):
for s in sinks:
if codecs.lookup(s).name not in ('utf-8', 'ascii'):
elif isinstance(s, io.TextIOBase):
if codecs.lookup(s.encoding).name not in ('utf-8', 'ascii'):
raise NotImplementedError(f"Unsupported encoding {s.encoding}")
self.sink_storage.push_back(
unique_ptr[data_sink](new iobase_data_sink(s.buffer))
)
data_sinks.push_back(self.sink_storage.back().get())
elif initial_sink_cls is str:
else:
self.sink_storage.push_back(
unique_ptr[data_sink](new iobase_data_sink(s))
)
data_sinks.push_back(self.sink_storage.back().get())
elif isinstance(sinks[0], str):
paths.reserve(len(sinks))
for s in sinks:
paths.push_back(<string> s.encode())
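
The revised `SinkInfo` constructor above now treats any mix of `io.IOBase` subclasses (`StringIO`, `BytesIO`, text-mode files) as compatible sinks, while a list of path strings remains a separate case. A small sketch of what the new isinstance checks accept and reject, as I read this diff (illustrative only):

```python
import io

import pylibcudf as plc

# In-memory sinks: any mix of io.IOBase subclasses is now accepted.
plc.io.SinkInfo([io.BytesIO(), io.StringIO()])

# File paths: a list of str is also accepted.
plc.io.SinkInfo(["out1.csv", "out2.csv"])

# Mixing an io object with a str path still raises, since str is not io.IOBase.
try:
    plc.io.SinkInfo([io.BytesIO(), "out.csv"])
except ValueError as e:
    print(e)  # "All sinks must be of the same type!"
```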
8 changes: 3 additions & 5 deletions python/pylibcudf/pylibcudf/tests/common/utils.py
@@ -385,12 +385,10 @@ def make_source(path_or_buf, pa_table, format, **kwargs):
NESTED_STRUCT_TESTING_TYPE,
]

NON_NESTED_PA_TYPES = NUMERIC_PA_TYPES + STRING_PA_TYPES + BOOL_PA_TYPES

DEFAULT_PA_TYPES = (
NUMERIC_PA_TYPES
+ STRING_PA_TYPES
+ BOOL_PA_TYPES
+ LIST_PA_TYPES
+ DEFAULT_PA_STRUCT_TESTING_TYPES
NON_NESTED_PA_TYPES + LIST_PA_TYPES + DEFAULT_PA_STRUCT_TESTING_TYPES
)

# Map pylibcudf compression types to pandas ones