Skip to content

Commit

Permalink
Remove cudf._lib.text in favor of inlining pylibcudf
Browse files: browse the repository at this point in the history
  • Loading branch information
mroeschke committed Nov 21, 2024
1 parent 78db66b commit ef45bb5
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 66 deletions.
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ set(cython_sources
stream_compaction.pyx
string_casting.pyx
strings_udf.pyx
text.pyx
timezone.pyx
transform.pyx
transpose.pyx
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
string_casting,
strings,
strings_udf,
text,
timezone,
transpose,
)
Expand Down
53 changes: 0 additions & 53 deletions python/cudf/cudf/_lib/text.pyx

This file was deleted.

45 changes: 34 additions & 11 deletions python/cudf/cudf/io/text.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Copyright (c) 2018-2024, NVIDIA CORPORATION.

from io import BytesIO, StringIO
from io import BytesIO, StringIO, TextIOBase

import pylibcudf as plc

import cudf
from cudf._lib import text as libtext
from cudf.utils import ioutils
from cudf.utils.performance_tracking import _performance_tracking

Expand Down Expand Up @@ -33,13 +34,35 @@ def read_text(
filepath_or_buffer, "read_text"
)

return cudf.Series._from_column(
libtext.read_text(
filepath_or_buffer,
delimiter=delimiter,
byte_range=byte_range,
strip_delimiters=strip_delimiters,
compression=compression,
compression_offsets=compression_offsets,
)
if compression is None:
if isinstance(filepath_or_buffer, TextIOBase):
datasource = plc.io.text.make_source(filepath_or_buffer.read())
else:
datasource = plc.io.text.make_source_from_file(filepath_or_buffer)
elif compression == "bgzip":
if isinstance(filepath_or_buffer, TextIOBase):
raise ValueError("bgzip compression requires a file path")
if compression_offsets is not None:
if len(compression_offsets) != 2:
raise ValueError(
"compression offsets need to consist of two elements"
)
datasource = plc.io.text.make_source_from_bgzip_file(
filepath_or_buffer,
compression_offsets[0],
compression_offsets[1],
)
else:
datasource = plc.io.text.make_source_from_bgzip_file(
filepath_or_buffer,
)
else:
raise ValueError("Only bgzip compression is supported at the moment")

options = plc.io.text.ParseOptions(
byte_range=byte_range, strip_delimiters=strip_delimiters
)
plc_column = plc.io.text.multibyte_split(datasource, delimiter, options)
result = cudf._lib.column.Column.from_pylibcudf(plc_column)

return cudf.Series._from_column(result)

0 comments on commit ef45bb5

Please sign in to comment.